Browse Source

Utiliza Solr para extração de texto (#1219)

pull/1242/head
Eduardo Calil 8 years ago
committed by Edward
parent
commit
9a8ce5abff
  1. 52
      sapl/base/search_indexes.py

52
sapl/base/search_indexes.py

@@ -1,17 +1,20 @@
import logging import logging
import os.path import os.path
import re
import string
import textract import textract
from django.template import Context, loader from django.template import loader
from haystack import indexes from haystack import indexes
from textract.exceptions import ExtensionNotSupported from textract.exceptions import ExtensionNotSupported
from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa
from sapl.norma.models import NormaJuridica from sapl.norma.models import NormaJuridica
from sapl.settings import BASE_DIR from sapl.settings import BASE_DIR, SOLR_URL
logger = logging.getLogger(BASE_DIR.name) logger = logging.getLogger(BASE_DIR.name)
class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable): class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True) text = indexes.CharField(document=True, use_template=True)
@@ -40,19 +43,38 @@ class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable):
if not os.path.splitext(arquivo.path)[1][:1]: if not os.path.splitext(arquivo.path)[1][:1]:
return self.prepared_data return self.prepared_data
try: # Em ambiente de produção utiliza-se o SOLR
extracted_data = textract.process( if SOLR_URL:
arquivo.path, extracted_data = self._get_backend(None).extract_file_contents(
language='pt-br').decode('utf-8').replace('\n', ' ') arquivo)['contents']
except ExtensionNotSupported: # Remove as tags xml
return self.prepared_data extracted_data = re.sub('<[^>]*>', '', extracted_data)
except Exception: # Remove tags \t e \n
msg = 'Erro inesperado processando arquivo: %s' % arquivo.path extracted_data = extracted_data.replace(
print(msg) '\n', ' ').replace('\t', ' ')
logger.error(msg) # Remove sinais de pontuação
return self.prepared_data extracted_data = re.sub('[' + string.punctuation + ']',
' ', extracted_data)
extracted_data = extracted_data.replace('\t', ' ') # Remove espaços múltiplos
extracted_data = " ".join(extracted_data.split())
# Em ambiente de DEV utiliza-se o Whoosh
# Como ele não possui extração, faz-se uso do textract
else:
try:
extracted_data = textract.process(
arquivo.path,
language='pt-br').decode('utf-8').replace('\n', ' ')
except ExtensionNotSupported:
return self.prepared_data
except Exception:
msg = 'Erro inesperado processando arquivo: %s' % (
arquivo.path)
print(msg)
logger.error(msg)
return self.prepared_data
extracted_data = extracted_data.replace('\t', ' ')
# Now we'll finally perform the template processing to render the # Now we'll finally perform the template processing to render the
# text field with *all* of our metadata visible for templating: # text field with *all* of our metadata visible for templating:

Loading…
Cancel
Save