Gustavo274
3 years ago
committed by
GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with
13 additions and
45 deletions
-
docker/docker-compose.yml
-
requirements/requirements.txt
-
sapl/base/search_indexes.py
-
solr/docs_stats.py
-
solr/sapl_configset/conf/managed-schema
-
solr/sapl_configset/conf/solrconfig.xml
|
|
@ -18,7 +18,7 @@ services: |
|
|
|
networks: |
|
|
|
- sapl-net |
|
|
|
saplsolr: |
|
|
|
image: solr:8.3 |
|
|
|
image: solr:8.9 |
|
|
|
restart: always |
|
|
|
command: bin/solr start -c -f |
|
|
|
container_name: solr |
|
|
|
|
|
@ -1,5 +1,5 @@ |
|
|
|
django==2.2.24 |
|
|
|
django-haystack==2.8.1 |
|
|
|
django-haystack==3.1.1 |
|
|
|
django-filter==2.4.0 |
|
|
|
djangorestframework==3.12.4 |
|
|
|
dj-database-url==0.5.0 |
|
|
|
|
|
@ -38,9 +38,14 @@ class TextExtractField(CharField): |
|
|
|
try: |
|
|
|
with open(arquivo.path, 'rb') as f: |
|
|
|
content = self.backend.extract_file_contents(f) |
|
|
|
if not content or not content['contents']: |
|
|
|
return '' |
|
|
|
data = content['contents'] |
|
|
|
data = '' |
|
|
|
if content: |
|
|
|
# update from Solr 7.5 to 8.9 |
|
|
|
if content['contents']: |
|
|
|
data += content['contents'] |
|
|
|
if content['file']: |
|
|
|
data += content['file'] |
|
|
|
return data |
|
|
|
except Exception as e: |
|
|
|
print('erro processando arquivo: %s' % arquivo.path) |
|
|
|
self.logger.error(arquivo.path) |
|
|
|
|
|
@ -1,37 +0,0 @@ |
|
|
|
"""Print statistics about the collections of a local Solr instance.

Prints the number of collections, the number of documents in each collection
and the total number of indexed documents.
"""
import requests

BASE_URL = 'http://localhost:8983/solr'

# Seconds to wait for each Solr response; without a timeout an unresponsive
# server would make the script hang forever.
REQUEST_TIMEOUT = 30


def _get_json(path, params):
    """Return the decoded JSON body of a GET request to BASE_URL + path.

    Raises requests.HTTPError on a 4xx/5xx response, so a Solr error is
    reported as such instead of surfacing later as a confusing KeyError.
    """
    resp = requests.get(BASE_URL + path, params=params, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.json()


def main():
    """Query Solr and print the per-collection and overall document counts."""
    listing = _get_json('/admin/collections', {'action': 'LIST'})
    collections = sorted(listing['collections'])

    largest_col = (None, -1)
    total_docs = 0

    print("Collection\t\t\tNumber of documents")
    print("--------------------------------------------------")

    for c in collections:
        # rows=0: only the match count is needed, not the documents themselves.
        result = _get_json('/{}/select'.format(c), {'q': '*:*', 'rows': 0})
        num_docs = result['response']['numFound']
        total_docs += num_docs

        # ">=" keeps the original tie-break: among collections with the same
        # highest count, the last one in sorted order wins.
        if num_docs >= largest_col[1]:
            largest_col = (c, num_docs)

        print("%30s\t%6s" % (c, num_docs))

    print("------------------------------------------")
    print("- Number of collections: %s\n" % len(collections))
    if collections:
        print("- Largest collection: '%s' (%s docs)\n" % largest_col)
    else:
        # Avoid reporting the initial placeholder as "'None' (-1 docs)".
        print("- Largest collection: none\n")
    print("- Total documents across all collections: %s\n" % total_docs)


if __name__ == '__main__':
    main()
|
|
|
|
|
|
@ -120,7 +120,7 @@ |
|
|
|
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/> |
|
|
|
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/> |
|
|
|
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" /> |
|
|
|
<field name="last_update" type="pdate" indexed="true" stored="true" default="NOW" /> |
|
|
|
<field name="last_update" type="pdate" indexed="true" stored="true" default="NOW" required="false" /> |
|
|
|
|
|
|
|
<!-- This can be enabled, in case the client does not know what fields may be searched. It isn't enabled by default |
|
|
|
because it's very expensive to index everything twice. --> |
|
|
@ -552,7 +552,7 @@ |
|
|
|
<charFilter class="solr.HTMLStripCharFilterFactory"/> |
|
|
|
<tokenizer class="solr.StandardTokenizerFactory"/> |
|
|
|
<filter class="solr.LowerCaseFilterFactory"/> |
|
|
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> |
|
|
|
<!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> --> |
|
|
|
<filter class="solr.PortugueseLightStemFilterFactory"/> |
|
|
|
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
|
|
|
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
|
|
|
|
|
@ -310,7 +310,7 @@ |
|
|
|
have some sort of hard autoCommit to limit the log size. |
|
|
|
--> |
|
|
|
<autoCommit> |
|
|
|
<maxTime>180000</maxTime> |
|
|
|
<maxTime>300000</maxTime> |
|
|
|
<openSearcher>false</openSearcher> |
|
|
|
</autoCommit> |
|
|
|
|
|
|
|