Browse Source

Consertar erros de indexação no Solr 8.9 (#3503)

pull/3514/head
Gustavo274 3 years ago
committed by GitHub
parent
commit
8689a62a20
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      docker/docker-compose.yml
  2. 2
      requirements/requirements.txt
  3. 11
      sapl/base/search_indexes.py
  4. 37
      solr/docs_stats.py
  5. 4
      solr/sapl_configset/conf/managed-schema
  6. 2
      solr/sapl_configset/conf/solrconfig.xml

2
docker/docker-compose.yml

@ -18,7 +18,7 @@ services:
networks:
- sapl-net
saplsolr:
image: solr:8.3
image: solr:8.9
restart: always
command: bin/solr start -c -f
container_name: solr

2
requirements/requirements.txt

@ -1,5 +1,5 @@
django==2.2.24
django-haystack==2.8.1
django-haystack==3.1.1
django-filter==2.4.0
djangorestframework==3.12.4
dj-database-url==0.5.0

11
sapl/base/search_indexes.py

@ -38,9 +38,14 @@ class TextExtractField(CharField):
try:
with open(arquivo.path, 'rb') as f:
content = self.backend.extract_file_contents(f)
if not content or not content['contents']:
return ''
data = content['contents']
data = ''
if content:
# update from Solr 7.5 to 8.9
if content['contents']:
data += content['contents']
if content['file']:
data += content['file']
return data
except Exception as e:
print('erro processando arquivo: ' % arquivo.path)
self.logger.error(arquivo.path)

37
solr/docs_stats.py

@ -1,37 +0,0 @@
"""
Print the number of Solr collections, the number of documents in each
collection, and the total number of indexed documents.
"""
import requests

# Root URL of the Solr instance that is queried.
BASE_URL = 'http://localhost:8983/solr'

# Seconds to wait for each Solr response; without a timeout, requests can
# block indefinitely when the server is unreachable or stalled.
REQUEST_TIMEOUT = 30


def _get_json(path):
    """Return the decoded JSON body of a GET request to BASE_URL + path.

    Raises requests.HTTPError when Solr answers with an error status, so a
    failing request is reported as such instead of as a confusing KeyError
    while reading the response body.
    """
    resp = requests.get(BASE_URL + path, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.json()


def main():
    """Print a per-collection document count table followed by a summary."""
    listing = _get_json('/admin/collections?action=LIST')
    collections = sorted(listing['collections'])

    # (collection name, document count) of the biggest collection seen so far.
    largest_col = (None, -1)
    total_docs = 0

    print("Collection\t\t\tNumber of documents")
    print("--------------------------------------------------")

    for c in collections:
        # rows=0: only the match count is needed, not the documents.
        result = _get_json('/{}/select?q=*:*&rows=0'.format(c))
        num_docs = result['response']['numFound']
        total_docs += num_docs

        # ">=" keeps the original tie-break: the last collection (in sorted
        # order) with the highest count wins.
        if num_docs >= largest_col[1]:
            largest_col = (c, num_docs)

        print("%30s\t%6s" % (c, num_docs))

    print("------------------------------------------")
    print("- Number of collections: %s\n" % len(collections))
    if collections:
        print("- Largest collection: '%s' (%s docs)\n" % largest_col)
    else:
        # Avoid printing the (None, -1) placeholder when Solr has no
        # collections at all.
        print("- Largest collection: none\n")
    print("- Total documents across all collections: %s\n" % total_docs)


if __name__ == '__main__':
    main()

4
solr/sapl_configset/conf/managed-schema

@ -120,7 +120,7 @@
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" />
<field name="last_update" type="pdate" indexed="true" stored="true" default="NOW" />
<field name="last_update" type="pdate" indexed="true" stored="true" default="NOW" required="false" />
<!-- This can be enabled, in case the client does not know what fields may be searched. It isn't enabled by default
because it's very expensive to index everything twice. -->
@ -552,7 +552,7 @@
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
<!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> -->
<filter class="solr.PortugueseLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->

2
solr/sapl_configset/conf/solrconfig.xml

@ -310,7 +310,7 @@
have some sort of hard autoCommit to limit the log size.
-->
<autoCommit>
<maxTime>180000</maxTime>
<maxTime>300000</maxTime>
<openSearcher>false</openSearcher>
</autoCommit>

Loading…
Cancel
Save