From fb0753d0909556ec87d3cad71822cf3bd5a71dad Mon Sep 17 00:00:00 2001 From: eduardocalil Date: Thu, 30 Mar 2017 14:44:40 -0300 Subject: [PATCH] Fix #809 full text search (#980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Inicia a pesquisa textual * Melhora o front-end da pesquisa textual * Bug fix * Ajusta alguns detalhes e cria botão de acesso a pesquisa * Adiciona signals para atualizar o index * Inicia a pesquisa textual * Melhora o front-end da pesquisa textual * Bug fix * Ajusta alguns detalhes e cria botão de acesso a pesquisa * Adiciona signals para atualizar o index * Insere ao README o comando de indexação * Atualiza versão do Haystack * Remove pacote haystack antigo Coloca url de search no padrão --- .gitignore | 1 + docs/instacao31.rst | 3 + requirements/requirements.txt | 3 + sapl/base/search_indexes.py | 58 +++++++++++ sapl/base/templatetags/common_tags.py | 13 ++- sapl/base/urls.py | 2 + sapl/materia/apps.py | 3 + sapl/materia/signals.py | 29 ++++++ sapl/settings.py | 11 +++ .../materia/materialegislativa_filter.html | 4 + .../materia/documentoacessorio_text.txt | 7 ++ .../materia/materialegislativa_text.txt | 7 ++ sapl/templates/search/search.html | 95 +++++++++++++++++++ 13 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 sapl/base/search_indexes.py create mode 100644 sapl/materia/signals.py create mode 100644 sapl/templates/search/indexes/materia/documentoacessorio_text.txt create mode 100644 sapl/templates/search/indexes/materia/materialegislativa_text.txt create mode 100644 sapl/templates/search/search.html diff --git a/.gitignore b/.gitignore index 38805b350..83de47a86 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,4 @@ collected_static bower bower_components media +whoosh/ \ No newline at end of file diff --git a/docs/instacao31.rst b/docs/instacao31.rst index 3d921a4a6..3d6a8b3eb 100644 --- a/docs/instacao31.rst +++ b/docs/instacao31.rst @@ -187,6 +187,9 @@ Copie a chave que aparecerá, edite o arquivo .env e altere o valor do parâmetr ./manage.py collectstatic --noinput +* Preparar o ambiente para indexação de arquivos:: + ./manage.py rebuild_index + * Subir o servidor do django:: ./manage.py runserver 0.0.0.0:8001 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 36ed3844b..6d7592cda 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,5 @@ dj-database-url==0.4.1 +django-haystack==2.6.0 django>=1.9,<1.10 # TODO O django-admin-bootstrapped 2.5.7 não inseriu a mudança que permite # a compatibilidade com Django 1.9+. A linha abaixo será mudada quando uma @@ -25,7 +26,9 @@ python-decouple==3.0 pytz==2016.4 pyyaml==3.11 rtyaml==0.0.3 +textract==1.5.0 unipath==1.1 python-magic==0.4.12 gunicorn==19.6.0 django-reversion==2.0.8 +whoosh==2.7.4 diff --git a/sapl/base/search_indexes.py b/sapl/base/search_indexes.py new file mode 100644 index 000000000..1bb379091 --- /dev/null +++ b/sapl/base/search_indexes.py @@ -0,0 +1,58 @@ +import textract + +from haystack import indexes +from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa +from django.template import Context, loader + + +class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + + filename = 'arquivo' + model = DocumentoAcessorio + template_name = 'materia/documentoacessorio_text.txt' + + def get_model(self): + return self.model + + def index_queryset(self, using=None): + return self.get_model().objects.all() + + def prepare(self, obj): + if not self.filename or not self.model or not self.template_name: + raise Exception + + data = super(DocumentoAcessorioIndex, self).prepare(obj) + + arquivo = getattr(obj, self.filename) + + if arquivo: + try: + arquivo.open() + except OSError: + return self.prepared_data + + extracted_data = textract.process( + arquivo.path).decode( + 'utf-8').replace('\n', ' ') + + extracted_data = extracted_data.replace('\t', ' ') + + # Now we'll finally perform the template processing to render the + # text field with *all* of our metadata visible for templating: + t = loader.select_template(( + 'search/indexes/' + self.template_name, )) + data['text'] = t.render(Context({'object': obj, + 'extracted': extracted_data})) + + return data + + return self.prepared_data + + +class MateriaLegislativaIndex(DocumentoAcessorioIndex): + text = indexes.CharField(document=True, use_template=True) + + filename = 'texto_original' + model = MateriaLegislativa + template_name = 'materia/materialegislativa_text.txt' diff --git a/sapl/base/templatetags/common_tags.py b/sapl/base/templatetags/common_tags.py index 9d643e70c..a908b7396 100644 --- a/sapl/base/templatetags/common_tags.py +++ b/sapl/base/templatetags/common_tags.py @@ -2,6 +2,7 @@ from compressor.utils import get_class from django import template from sapl.base.models import AppConfig +from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa from sapl.parlamentares.models import Filiacao register = template.Library() @@ -124,7 +125,7 @@ def url(value): @register.filter -def cronometro_to_seconds(value): +def cronometro_to_seconds(value): if not AppConfig.attr('cronometro_' + value): return 0 @@ -137,3 +138,13 @@ def cronometro_to_seconds(value): @register.filter def to_list_pk(object_list): return [o.pk for o in object_list] + + +@register.filter +def search_get_model(object): + if type(object) == MateriaLegislativa: + return 'm' + elif type(object) == DocumentoAcessorio: + return 'd' + + return None diff --git a/sapl/base/urls.py b/sapl/base/urls.py index 361ab2cda..d5e4ae98a 100644 --- a/sapl/base/urls.py +++ b/sapl/base/urls.py @@ -99,4 +99,6 @@ urlpatterns = [ name='login'), url(r'^logout/$', views.logout, {'next_page': '/login'}, name='logout'), + url(r'^sistema/search/', include('haystack.urls')), + ] + recuperar_senha diff --git a/sapl/materia/apps.py b/sapl/materia/apps.py index 3ac053d1b..ecc8d09de 100644 --- a/sapl/materia/apps.py +++ b/sapl/materia/apps.py @@ -6,3 +6,6 @@ class AppConfig(apps.AppConfig): name = 'sapl.materia' label = 'materia' verbose_name = _('Matéria') + + def ready(self): + from . import signals diff --git a/sapl/materia/signals.py b/sapl/materia/signals.py new file mode 100644 index 000000000..96ff85dea --- /dev/null +++ b/sapl/materia/signals.py @@ -0,0 +1,29 @@ +from django.db.models.signals import post_delete, post_save +from sapl.settings import PROJECT_DIR +from subprocess import PIPE, call +from threading import Thread + + +from .models import MateriaLegislativa, DocumentoAcessorio + + +class UpdateIndexCommand(Thread): + def run(self): + call([PROJECT_DIR.child('manage.py'), 'update_index'], + stdout=PIPE) + + +def save_texto(sender, instance, **kwargs): + update_index = UpdateIndexCommand() + update_index.start() + + +def delete_texto(sender, instance, **kwargs): + update_index = UpdateIndexCommand() + update_index.start() + + +post_save.connect(save_texto, sender=MateriaLegislativa) +post_save.connect(save_texto, sender=DocumentoAcessorio) +post_delete.connect(delete_texto, sender=MateriaLegislativa) +post_delete.connect(delete_texto, sender=DocumentoAcessorio) diff --git a/sapl/settings.py b/sapl/settings.py index 37646e675..d9875994b 100644 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -76,12 +76,23 @@ INSTALLED_APPS = ( 'crispy_forms', 'easy_thumbnails', 'floppyforms', + 'haystack', 'sass_processor', 'rest_framework', 'reversion', + 'whoosh', ) + SAPL_APPS + +HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine', + 'PATH': PROJECT_DIR.child('whoosh'), + }, +} + + if DEBUG: INSTALLED_APPS += ('debug_toolbar', 'rest_framework_docs',) diff --git a/sapl/templates/materia/materialegislativa_filter.html b/sapl/templates/materia/materialegislativa_filter.html index 639031d16..cd459c61c 100644 --- a/sapl/templates/materia/materialegislativa_filter.html +++ b/sapl/templates/materia/materialegislativa_filter.html @@ -4,6 +4,10 @@ {% block actions %}
+ + Pesquisa Textual + + {% if perms.materia %} {% blocktrans with verbose_name=view.verbose_name %} Adicionar Matéria Legislativa {% endblocktrans %} diff --git a/sapl/templates/search/indexes/materia/documentoacessorio_text.txt b/sapl/templates/search/indexes/materia/documentoacessorio_text.txt new file mode 100644 index 000000000..0f9218324 --- /dev/null +++ b/sapl/templates/search/indexes/materia/documentoacessorio_text.txt @@ -0,0 +1,7 @@ +{% for k, v in extracted.metadata.items %} + {% for val in v %} + {{ k }}: {{ val|safe }} + {% endfor %} +{% endfor %} + +{{ extracted|striptags|safe }} \ No newline at end of file diff --git a/sapl/templates/search/indexes/materia/materialegislativa_text.txt b/sapl/templates/search/indexes/materia/materialegislativa_text.txt new file mode 100644 index 000000000..0f9218324 --- /dev/null +++ b/sapl/templates/search/indexes/materia/materialegislativa_text.txt @@ -0,0 +1,7 @@ +{% for k, v in extracted.metadata.items %} + {% for val in v %} + {{ k }}: {{ val|safe }} + {% endfor %} +{% endfor %} + +{{ extracted|striptags|safe }} \ No newline at end of file diff --git a/sapl/templates/search/search.html b/sapl/templates/search/search.html new file mode 100644 index 000000000..bd925d37c --- /dev/null +++ b/sapl/templates/search/search.html @@ -0,0 +1,95 @@ +{% extends 'crud/form.html' %} +{% load crispy_forms_tags %} +{% load common_tags %} + +{% block base_content %} +

Pesquisa Textual

+ +
+ +
+
+ +
+
+ {{ form.q|as_crispy_field }} +
+
+ +
+
+
+

Em quais tipos de documento deseja pesquisar?

+
+
+ +
+
+ {{ form.models }} +
+
+ + +
+
+ +
+
+ +
+ +
+ {% if query %} + + + + + + {% for result in page.object_list %} + + + + + {% empty %} +

Nenhum texto encontrado!

+ + + + {% endfor %} +

Resultados

+ {% if result.object|search_get_model == 'm' %} +

+ Matéria Legislativa: {{ result.object }}
+ + {% if result.object.texto_original %} + Texto Original: Clique aqui
+ {% else %} + O texto desta matéria foi removido recentemente. Em breve ela sairá desta listagem.
+ {% endif %} +

+ + {% elif result.object|search_get_model == 'd' %} +

+ Documento Acessório: {{ result.object }}
+ {% if result.object.arquivo %} + Texto Original: Clique aqui
+ {% else %} + O texto deste documento foi removido recentemente. Em breve ele sairá desta listagem.
+ {% endif %} +

+ {% endif %} +
+
+ + {% if page.has_previous or page.has_next %} +
+ {% if page.has_previous %}{% endif %}« Previous{% if page.has_previous %}{% endif %} + | + {% if page.has_next %}{% endif %}Next »{% if page.has_next %}{% endif %} +
+ {% endif %} + {% else %} + {# Show some example queries to run, maybe query syntax, something else? #} + {% endif %} +
+{% endblock %}