def solr_extraction(self, arquivo):
    """Return the text the Solr backend extracts from *arquivo*.

    The file at ``arquivo.path`` is opened in binary mode and passed to the
    backend's ``extract_file_contents``. Extraction is best-effort: if the
    file cannot be opened, the backend raises, or the result has no
    ``'contents'``, the problem is printed and logged and ``''`` is
    returned instead of propagating the exception.

    Args:
        arquivo: object exposing a ``path`` attribute (a file field value).

    Returns:
        The extracted text, or an empty string when nothing was extracted.
    """
    if not self.backend:
        # Resolved on first use and cached on the field instance.
        self.backend = connections['default'].get_backend()
    try:
        with open(arquivo.path, 'rb') as f:
            content = self.backend.extract_file_contents(f)
        data = content['contents'] if content else ''
    except Exception:
        # The previous message had no "%s" placeholder, so formatting it
        # raised TypeError from inside this handler and defeated the
        # fallback to an empty string.
        print('erro processando arquivo: %s' % arquivo.path)
        self.logger.exception('erro processando arquivo: %s', arquivo.path)
        return ''
    return data or ''


def print_error(self, arquivo, error=None):
    """Print and log an unexpected failure while processing *arquivo*.

    Args:
        arquivo: object exposing a ``path`` attribute.
        error: the exception (or its text) that was raised. Optional so
            that callers still using the one-argument form keep working.
    """
    msg = 'Erro inesperado processando arquivo %s erro: %s' % (
        arquivo.path, error)
    print(msg)
    # Logged without extra args: ``msg`` is already formatted, and passing
    # ``error`` again made logging attempt a second %-interpolation.
    self.logger.error(msg)
".format(arquivo.path) + str(e)) - self.print_error(arquivo) + except Exception as err: + print(str(err)) + self.print_error(arquivo, err) # Em ambiente de DEV utiliza-se o Whoosh # Como ele não possui extração, faz-se uso do textract @@ -91,13 +87,13 @@ class TextExtractField(CharField): try: self.logger.debug("Tentando whoosh_extraction no arquivo {}".format(arquivo.path)) return self.whoosh_extraction(arquivo) - except ExtensionNotSupported as e: - self.logger.error("Erro no arquivo {}".format(arquivo.path) + str(e)) - print(str(e)) - except Exception as e2: - self.logger.error(str(e)) - print(str(e2)) self.print_error(arquivo) + except ExtensionNotSupported as err: + print(str(err)) + self.logger.error(str(err)) + except Exception as err: + print(str(err)) + self.print_error(arquivo, str(err)) return '' def ta_extractor(self, value): @@ -133,7 +129,9 @@ class TextExtractField(CharField): value = getattr(obj, attr) if not value: continue - data += getattr(self, func)(value) + data += getattr(self, func)(value) + ' ' + + data = data.replace('\n', ' ') return data @@ -159,6 +157,10 @@ class DocumentoAcessorioIndex(SearchIndex, Indexable): ) ) + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.text.search_index = self + def get_model(self): return self.model diff --git a/sapl/materia/views.py b/sapl/materia/views.py index 578357721..8af8a19e2 100644 --- a/sapl/materia/views.py +++ b/sapl/materia/views.py @@ -1810,6 +1810,8 @@ class MateriaLegislativaPesquisaView(FilterView): context['show_results'] = show_results_filter_set(qr) + context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False + return context diff --git a/sapl/norma/views.py b/sapl/norma/views.py index f7800c42f..0e0ed23e4 100644 --- a/sapl/norma/views.py +++ b/sapl/norma/views.py @@ -15,6 +15,7 @@ from django.views.generic import TemplateView, UpdateView from django.views.generic.base import RedirectView from django.views.generic.edit import FormView from 
django_filters.views import FilterView +from sapl import settings from sapl.base.models import AppConfig from sapl.compilacao.views import IntegracaoTaView from sapl.crud.base import (RP_DETAIL, RP_LIST, Crud, CrudAux, @@ -107,6 +108,7 @@ class NormaPesquisaView(FilterView): context['filter_url'] = ('&' + qr.urlencode()) if len(qr) > 0 else '' context['show_results'] = show_results_filter_set(qr) + context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False return context diff --git a/sapl/settings.py b/sapl/settings.py index d30b4df3c..0d6a452bc 100755 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -100,23 +100,28 @@ INSTALLED_APPS = ( # FTS = Full Text Search # Desabilita a indexação textual até encontramos uma solução para a issue # https://github.com/interlegis/sapl/issues/2055 -#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' -HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor' +#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor' # Disable auto index +HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' SEARCH_BACKEND = 'haystack.backends.whoosh_backend.WhooshEngine' SEARCH_URL = ('PATH', PROJECT_DIR.child('whoosh')) -SOLR_URL = config('SOLR_URL', cast=str, default='') -if SOLR_URL: +# SOLR +USE_SOLR = config('USE_SOLR', cast=bool, default=False) +SOLR_URL = config('SOLR_URL', cast=str, default='http://localhost:8983') +SOLR_COLLECTION = config('SOLR_COLLECTION', cast=str, default='sapl') + +if USE_SOLR: SEARCH_BACKEND = 'haystack.backends.solr_backend.SolrEngine' - SEARCH_URL = ('URL', config('SOLR_URL', cast=str)) - # ...or for multicore... 
- # 'URL': 'http://127.0.0.1:8983/solr/mysite', + SEARCH_URL = ('URL', '{}/solr/{}'.format(SOLR_URL, SOLR_COLLECTION)) +# BATCH_SIZE: default is 1000 if omitted, avoid Too Large Entity Body errors HAYSTACK_CONNECTIONS = { 'default': { 'ENGINE': SEARCH_BACKEND, - SEARCH_URL[0]: SEARCH_URL[1] + SEARCH_URL[0]: SEARCH_URL[1], + 'BATCH_SIZE': 500, + 'TIMEOUT': 60, }, } diff --git a/sapl/templates/materia/materialegislativa_filter.html b/sapl/templates/materia/materialegislativa_filter.html index cdd408af3..5ff3eaee0 100644 --- a/sapl/templates/materia/materialegislativa_filter.html +++ b/sapl/templates/materia/materialegislativa_filter.html @@ -3,11 +3,13 @@ {% load crispy_forms_tags %} {% block actions %} +
- + {% if USE_SOLR %} + + Pesquisa Textual + + {% endif %} {% if perms.materia.add_materialegislativa %} diff --git a/sapl/templates/norma/normajuridica_filter.html b/sapl/templates/norma/normajuridica_filter.html index ef668cf36..0c7547661 100644 --- a/sapl/templates/norma/normajuridica_filter.html +++ b/sapl/templates/norma/normajuridica_filter.html @@ -4,11 +4,11 @@ {% block actions %}
# Example stack: PostgreSQL + SolrCloud (single node) + SAPL.
# The passwords and e-mail settings below are placeholders; override them
# before using this file outside a local test environment.
version: '2'
services:
  sapldb:
    image: postgres:10.5-alpine
    restart: always
    environment:
      POSTGRES_PASSWORD: sapl
      POSTGRES_USER: sapl
      POSTGRES_DB: sapl
      PGDATA: /var/lib/postgresql/data/
    volumes:
      - sapldb_data:/var/lib/postgresql/data/
    ports:
      - "5432:5432"

  saplsolr:
    image: solr:7.4-alpine
    restart: always
    # -c starts Solr in SolrCloud mode, -f keeps it in the foreground.
    command: bin/solr start -c -f
    volumes:
      - solr_data:/opt/solr/server/solr
      - solr_configsets:/opt/solr/server/solr/configsets
    ports:
      - "8983:8983"

  sapl:
    image: interlegis/sapl:3.1.138
    # build: .
    restart: always
    environment:
      ADMIN_PASSWORD: interlegis
      ADMIN_EMAIL: email@dominio.net
      DEBUG: 'False'
      USE_TLS: 'False'
      EMAIL_PORT: 587
      EMAIL_HOST: smtp.dominio.net
      EMAIL_HOST_USER: usuariosmtp
      EMAIL_HOST_PASSWORD: senhasmtp
      USE_SOLR: 'True'
      # SOLR_URL is the server base URL only: settings.py and solr_api.py
      # both append "/solr/<collection>" (or "/solr/admin/...") themselves,
      # so including the collection path here would duplicate it.
      SOLR_URL: http://saplsolr:8983
      SOLR_COLLECTION: sapl
      TZ: America/Sao_Paulo
    volumes:
      - sapl_data:/var/interlegis/sapl/data
      - sapl_media:/var/interlegis/sapl/media
      - sapl_root:/var/interlegis/sapl
    volumes_from:
      - saplsolr
    depends_on:
      - sapldb
      - saplsolr
    ports:
      - "80:80"
volumes:
  sapldb_data:
  sapl_data:
  sapl_media:
  sapl_root:
  solr_data:
  solr_configsets:
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/solr/sapl_configset/conf/lang/stopwords_pt.txt b/solr/sapl_configset/conf/lang/stopwords_pt.txt new file mode 100644 index 000000000..acfeb01af --- /dev/null +++ b/solr/sapl_configset/conf/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/solr/sapl_configset/conf/managed-schema b/solr/sapl_configset/conf/managed-schema new file mode 100644 index 000000000..0cba1950a --- /dev/null +++ b/solr/sapl_configset/conf/managed-schema @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/sapl_configset/conf/params.json b/solr/sapl_configset/conf/params.json new file mode 100644 index 000000000..06114ef25 --- /dev/null +++ b/solr/sapl_configset/conf/params.json @@ -0,0 +1,20 @@ +{"params":{ + "query":{ + "defType":"edismax", + "q.alt":"*:*", + "rows":"10", + "fl":"*,score", + "":{"v":0} + }, + "facets":{ + "facet":"on", + "facet.mincount": "1", + "":{"v":0} + }, + "velocity":{ + "wt": "velocity", + "v.template":"browse", + "v.layout": "layout", + "":{"v":0} + } +}} \ No newline at end of file diff --git a/solr/sapl_configset/conf/protwords.txt b/solr/sapl_configset/conf/protwords.txt new file mode 100644 index 000000000..1dfc0abec --- /dev/null +++ b/solr/sapl_configset/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. 
+dontstems +zwhacky + diff --git a/solr/sapl_configset/conf/saplconfigset.zip b/solr/sapl_configset/conf/saplconfigset.zip new file mode 100644 index 000000000..13a7a41ce Binary files /dev/null and b/solr/sapl_configset/conf/saplconfigset.zip differ diff --git a/solr/sapl_configset/conf/schema.xml b/solr/sapl_configset/conf/schema.xml new file mode 100644 index 000000000..597033929 --- /dev/null +++ b/solr/sapl_configset/conf/schema.xml @@ -0,0 +1,165 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + text + + + + diff --git a/solr/sapl_configset/conf/solrconfig.xml b/solr/sapl_configset/conf/solrconfig.xml new file mode 100644 index 000000000..9a9f29196 --- /dev/null +++ b/solr/sapl_configset/conf/solrconfig.xml @@ -0,0 +1,1367 @@ + + + + + + + + + 7.3.1 + + + + + + + + + + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + 300000 + false + + + + + + 30000 + + + + + + + + + + + + + + 1024 + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + text + edismax + + + + + + + + + + + + + + + + explicit + json + true + + + + + + + + explicit + + + + + + _text_ + + + + + + + true + ignored_ + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + true + + + tvComponent + + + + + + + + + + + + true + false + + + terms + + + + + + + + 
string + + + + + + explicit + + + elevator + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] + _ + + + + + + + yyyy-MM-dd'T'HH:mm:ss.SSSZ + yyyy-MM-dd'T'HH:mm:ss,SSSZ + yyyy-MM-dd'T'HH:mm:ss.SSS + yyyy-MM-dd'T'HH:mm:ss,SSS + yyyy-MM-dd'T'HH:mm:ssZ + yyyy-MM-dd'T'HH:mm:ss + yyyy-MM-dd'T'HH:mmZ + yyyy-MM-dd'T'HH:mm + yyyy-MM-dd HH:mm:ss.SSSZ + yyyy-MM-dd HH:mm:ss,SSSZ + yyyy-MM-dd HH:mm:ss.SSS + yyyy-MM-dd HH:mm:ss,SSS + yyyy-MM-dd HH:mm:ssZ + yyyy-MM-dd HH:mm:ss + yyyy-MM-dd HH:mmZ + yyyy-MM-dd HH:mm + yyyy-MM-dd + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + ${velocity.template.base.dir:} + ${velocity.solr.resource.loader.enabled:true} + ${velocity.params.resource.loader.enabled:false} + + + + + 5 + + + + + + + + + + + + + + diff --git a/solr/sapl_configset/conf/stopwords.txt b/solr/sapl_configset/conf/stopwords.txt new file mode 100644 index 000000000..ae1e83eeb --- /dev/null +++ b/solr/sapl_configset/conf/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/solr/sapl_configset/conf/synonyms.txt b/solr/sapl_configset/conf/synonyms.txt new file mode 100644 index 000000000..eab4ee875 --- /dev/null +++ b/solr/sapl_configset/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. 
"""Create (and, when empty, index) the SAPL collection on a Solr server.

Usage: python3 solr_api.py -u http://host:8983 -c sapl [-s N] [-rf N] [-ms N]
"""
import argparse
import subprocess
import sys


class SolrClient:
    """Small client for the Solr HTTP endpoints used to provision SAPL.

    Every URL template takes the server base URL (no trailing ``/solr``)
    as its first ``format`` argument.
    """

    LIST_CONFIGSETS = "{}/solr/admin/configs?action=LIST&omitHeader=true&wt=json"
    UPLOAD_CONFIGSET = "{}/solr/admin/configs?action=UPLOAD&name={}&wt=json"
    LIST_COLLECTIONS = "{}/solr/admin/collections?action=LIST&wt=json"
    STATUS_COLLECTION = "{}/solr/admin/collections?action=CLUSTERSTATUS&collection={}&wt=json"
    # Was missing the "/solr" prefix that every other endpoint carries.
    STATUS_CORE = "{}/solr/admin/cores?action=STATUS&name={}"
    EXISTS_COLLECTION = "{}/solr/{}/admin/ping?wt=json"
    OPTIMIZE_COLLECTION = "{}/solr/{}/update?optimize=true&wt=json"
    CREATE_COLLECTION = "{}/solr/admin/collections?action=CREATE&name={}&collection.configName={}&numShards={}&replicationFactor={}&maxShardsPerNode={}&wt=json"
    DELETE_COLLECTION = "{}/solr/admin/collections?action=DELETE&name={}&wt=json"
    DELETE_DATA = "{}/solr/{}/update?commitWithin=1000&overwrite=true&wt=json"
    # rows=0: only the numFound counter is needed, not the documents.
    QUERY_DATA = "{}/solr/{}/select?q=*:*&rows=0&wt=json"

    CONFIGSET_NAME = "sapl_configset"
    # Relative to the working directory the script is started from.
    CONFIGSET_PATH = "./solr/sapl_configset/conf/saplconfigset.zip"
    # XML update message that removes every document of a collection.
    DELETE_ALL_QUERY = "<delete><query>*:*</query></delete>"

    def __init__(self, url, session=None, timeout=60):
        """Store the connection settings.

        Args:
            url: Solr base URL, e.g. ``http://localhost:8983``.
            session: object with ``get``/``post`` methods compatible with
                ``requests.Session``; a new session is created when omitted.
            timeout: seconds to wait for each HTTP request.
        """
        self.url = url.rstrip('/')
        if session is None:
            # Imported here so a caller-supplied session does not require
            # the requests package to be importable.
            import requests
            session = requests.Session()
        self.session = session
        self.timeout = timeout

    @staticmethod
    def _error_message(res):
        """Return Solr's error message, or the raw body if it is not JSON."""
        try:
            return res.json()['error']['msg']
        except (ValueError, KeyError, TypeError):
            return res.text

    def _get_json(self, req_url):
        """GET *req_url* and return the decoded JSON body."""
        res = self.session.get(req_url, timeout=self.timeout)
        res.raise_for_status()
        return res.json()

    def get_num_docs(self, collection_name):
        """Return the number of documents indexed in *collection_name*."""
        dic = self._get_json(self.QUERY_DATA.format(self.url, collection_name))
        return dic["response"]["numFound"]

    def list_collections(self):
        """Return the names of the collections known to the server."""
        return self._get_json(self.LIST_COLLECTIONS.format(self.url))['collections']

    def exists_collection(self, collection_name):
        """Return True when *collection_name* is listed by the server."""
        return collection_name in self.list_collections()

    def maybe_upload_configset(self, force=False):
        """Upload the SAPL config set unless the server already lists it.

        Args:
            force: upload even when the config set name is already listed.
        """
        configsets = self._get_json(self.LIST_CONFIGSETS.format(self.url))['configSets']
        if self.CONFIGSET_NAME in configsets and not force:
            print('O %s já presente no servidor, NÃO enviando.' % self.CONFIGSET_NAME)
            return

        req_url = self.UPLOAD_CONFIGSET.format(self.url, self.CONFIGSET_NAME)
        # The context manager closes the zip file, which used to stay open.
        with open(self.CONFIGSET_PATH, 'rb') as zipped:
            files = {'file': ('saplconfigset.zip',
                              zipped,
                              'application/octet-stream',
                              {'Expires': '0'})}
            resp = self.session.post(req_url, files=files, timeout=self.timeout)
        print(resp.content)

    def create_collection(self, collection_name, shards=1, replication_factor=1, max_shards_per_node=1):
        """Create *collection_name* from the SAPL config set.

        Returns:
            True on success, False when the server reports an error.
        """
        self.maybe_upload_configset()
        req_url = self.CREATE_COLLECTION.format(self.url,
                                                collection_name,
                                                self.CONFIGSET_NAME,
                                                shards,
                                                replication_factor,
                                                max_shards_per_node)
        res = self.session.post(req_url, timeout=self.timeout)
        if res.ok:
            print("Collection '%s' created succesfully" % collection_name)
            return True
        print("Error creating collection '%s'" % collection_name)
        print("Error %s: %s" % (res.status_code, self._error_message(res)))
        return False

    def delete_collection(self, collection_name):
        """Delete *collection_name*, or every collection when it is ``'*'``."""
        if collection_name == '*':
            collections = self.list_collections()
        else:
            collections = [collection_name]

        for c in collections:
            req_url = self.DELETE_COLLECTION.format(self.url, c)
            res = self.session.post(req_url, timeout=self.timeout)
            if not res.ok:
                print("Error deleting collection '%s'" % c)
                print("Code {}: {}".format(res.status_code, res.text))
            else:
                print("Collection '%s' deleted successfully!" % c)

    def delete_index_data(self, collection_name):
        """Remove every document from *collection_name* and print the count left."""
        req_url = self.DELETE_DATA.format(self.url, collection_name)
        res = self.session.post(req_url,
                                data=self.DELETE_ALL_QUERY,
                                headers={'Content-Type': 'application/xml'},
                                timeout=self.timeout)
        if not res.ok:
            print("Error deleting index for collection '%s'" % collection_name)
            print("Code {}: {}".format(res.status_code, res.text))
        else:
            print("Collection '%s' data deleted successfully!" % collection_name)

            num_docs = self.get_num_docs(collection_name)
            print("Num docs: %s" % num_docs)


def build_parser():
    """Build the command-line parser for this script."""
    parser = argparse.ArgumentParser(description='Cria uma collection no Solr')

    # required arguments
    parser.add_argument('-u', type=str, metavar='URL', dest='url',
                        required=True, help='Endereço do servidor Solr na forma http(s)://<address>[:port]')
    parser.add_argument('-c', type=str, metavar='COLLECTION', dest='collection',
                        required=True, help='Collection Solr a ser criada')

    # optional arguments; each one requires a value when the flag is given
    parser.add_argument('-s', type=int, dest='shards',
                        help='Number of shards (default=1)', default=1)
    parser.add_argument('-rf', type=int, dest='replication_factor',
                        help='Replication factor (default=1)', default=1)
    parser.add_argument('-ms', type=int, dest='max_shards_per_node',
                        help='Max shards per node (default=1)', default=1)
    return parser


def main(argv=None):
    """Ensure the collection exists and reindex it when it is empty.

    Args:
        argv: argument list; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, non-zero on failure.
    """
    args = build_parser().parse_args(argv)
    collection = args.collection

    client = SolrClient(url=args.url)

    if not client.exists_collection(collection):
        print("Collection '%s' doesn't exists. Creating a new one..." % collection)
        created = client.create_collection(collection,
                                           shards=args.shards,
                                           replication_factor=args.replication_factor,
                                           max_shards_per_node=args.max_shards_per_node)
        if not created:
            return 1
    else:
        print("Collection '%s' exists." % collection)

    num_docs = client.get_num_docs(collection)
    if num_docs == 0:
        print("Performing a full reindex of '%s' collection..." % collection)
        return subprocess.call(["python3", "manage.py", "rebuild_index", "--noinput"])
    return 0


if __name__ == '__main__':
    sys.exit(main())
}

create_env

/bin/sh busy-wait.sh $DATABASE_URL

## SOLR

# Same defaults that create_env writes to the env file, so solr_api.py
# never receives an empty -u / -c argument.
SOLR_URL="${SOLR_URL-http://saplsolr:8983}"
SOLR_COLLECTION="${SOLR_COLLECTION-sapl}"
NUM_SHARDS="${NUM_SHARDS-1}"
RF="${RF-1}"
# Defaults to NUM_SHARDS (as before) but can now be overridden on its own.
MAX_SHARDS_PER_NODE="${MAX_SHARDS_PER_NODE-$NUM_SHARDS}"

# Provision the Solr collection in the background only when USE_SOLR is
# set to exactly "True". POSIX test is used so this also runs under /bin/sh.
if [ "${USE_SOLR-}" = "True" ]; then
    python3 solr_api.py -u "$SOLR_URL" -c "$SOLR_COLLECTION" -s "$NUM_SHARDS" -rf "$RF" -ms "$MAX_SHARDS_PER_NODE" &
    # python3 manage.py rebuild_index --noinput &
fi

# "manage.py migrate --noinput" did not work, so the prompts are answered
# by piping "yes" into the command.
yes yes | python3 manage.py migrate
#python3 manage.py collectstatic --no-input

echo "Criando usuário admin..."