Browse Source

Fix #1851 (#1853)

* Fix #1851

* Adiciona unrtf para indexar .rtf

* Altera sintaxe
pull/1867/head
Victor Fabre 7 years ago
committed by Edward
parent
commit
312c112577
  1. 2
      Dockerfile
  2. 19
      sapl/base/search_indexes.py
  3. 22
      sapl/utils.py

2
Dockerfile

@ -2,7 +2,7 @@ FROM alpine:3.5
ENV BUILD_PACKAGES postgresql-dev graphviz-dev graphviz build-base git pkgconfig \ ENV BUILD_PACKAGES postgresql-dev graphviz-dev graphviz build-base git pkgconfig \
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev nodejs py3-lxml \ python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev nodejs py3-lxml \
py3-magic postgresql-client poppler-utils antiword vim py3-magic postgresql-client poppler-utils antiword vim unrtf
RUN apk --update add fontconfig ttf-dejavu && fc-cache -fv RUN apk --update add fontconfig ttf-dejavu && fc-cache -fv

19
sapl/base/search_indexes.py

@ -21,6 +21,7 @@ from sapl.compilacao.models import (STATUS_TA_IMMUTABLE_PUBLIC,
from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa
from sapl.norma.models import NormaJuridica from sapl.norma.models import NormaJuridica
from sapl.settings import BASE_DIR, SOLR_URL from sapl.settings import BASE_DIR, SOLR_URL
from sapl.utils import RemoveTag
logger = logging.getLogger(BASE_DIR.name) logger = logging.getLogger(BASE_DIR.name)
@ -51,10 +52,17 @@ class TextExtractField(CharField):
return extracted_data return extracted_data
def whoosh_extraction(self, arquivo): def whoosh_extraction(self, arquivo):
return textract.process(
arquivo.path, if arquivo.path.endswith('html') or arquivo.path.endswith('xml'):
language='pt-br').decode('utf-8').replace('\n', ' ').replace( with open(arquivo.path, 'r', encoding="utf8", errors='ignore') as f:
'\t', ' ') content = ' '.join(f.read())
return RemoveTag(content)
else:
return textract.process(
arquivo.path,
language='pt-br').decode('utf-8').replace('\n', ' ').replace(
'\t', ' ')
def print_error(self, arquivo): def print_error(self, arquivo):
msg = 'Erro inesperado processando arquivo: %s' % ( msg = 'Erro inesperado processando arquivo: %s' % (
@ -82,7 +90,8 @@ class TextExtractField(CharField):
except ExtensionNotSupported as e: except ExtensionNotSupported as e:
print(str(e)) print(str(e))
logger.error(str(e)) logger.error(str(e))
except Exception: except Exception as e2:
print(str(e2))
self.print_error(arquivo) self.print_error(arquivo)
return '' return ''

22
sapl/utils.py

@ -707,12 +707,28 @@ def TrocaTag(texto, startTag, endTag, sizeStart, sizeEnd, styleName):
if (texto[i:i + sizeEnd] == endTag): if (texto[i:i + sizeEnd] == endTag):
textoSaida += 'blockTable><para>' textoSaida += 'blockTable><para>'
insideTag = 0 insideTag = 0
i = i + sizeEnd i += sizeEnd
else: else:
textoSaida += texto[i] textoSaida += texto[i]
i = i + 1 i += 1
else: else:
textoSaida += texto[i] textoSaida += texto[i]
i = i + 1 i += 1
return textoSaida
def RemoveTag(texto):
textoSaida = ''
i = 0
while (i < len(texto)):
if (texto[i] == '<'):
i = ExtraiTag(texto, i)
else:
textoSaida += texto[i]
i += 1
return textoSaida return textoSaida

Loading…
Cancel
Save