Browse Source

Fix #1851 (#1853)

* Fix #1851

* Adiciona unrtf para indexar .rtf

* Altera sintaxe
pull/1866/head
Victor Fabre 7 years ago
committed by Edward Ribeiro
parent
commit
5b506ff837
  1. 2
      Dockerfile
  2. 19
      sapl/base/search_indexes.py
  3. 22
      sapl/utils.py

2
Dockerfile

@ -2,7 +2,7 @@ FROM alpine:3.5
ENV BUILD_PACKAGES postgresql-dev graphviz-dev graphviz build-base git pkgconfig \
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev nodejs py3-lxml \
py3-magic postgresql-client poppler-utils antiword vim
py3-magic postgresql-client poppler-utils antiword vim unrtf
RUN apk --update add fontconfig ttf-dejavu && fc-cache -fv

19
sapl/base/search_indexes.py

@ -21,6 +21,7 @@ from sapl.compilacao.models import (STATUS_TA_IMMUTABLE_PUBLIC,
from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa
from sapl.norma.models import NormaJuridica
from sapl.settings import BASE_DIR, SOLR_URL
from sapl.utils import RemoveTag
logger = logging.getLogger(BASE_DIR.name)
@ -51,10 +52,17 @@ class TextExtractField(CharField):
return extracted_data
def whoosh_extraction(self, arquivo):
    """Extract the plain text of ``arquivo`` so it can be indexed.

    Files whose path ends in ``html`` or ``xml`` are read as UTF-8 text
    (undecodable bytes are ignored) and passed through ``RemoveTag``.
    Every other file is handed to ``textract.process``. In both cases
    newlines and tabs in the result are replaced by spaces.

    :param arquivo: object exposing a ``path`` attribute with the
        location of the file on disk.
    :return: the extracted text as a single-line string.
    """
    if arquivo.path.endswith(('html', 'xml')):
        with open(arquivo.path, 'r', encoding="utf8", errors='ignore') as f:
            # Read the document as-is. The earlier ' '.join(f.read())
            # iterated the string character by character and therefore
            # inserted a space between every letter of the file.
            content = f.read()
        # Flatten whitespace after tag removal, mirroring the textract
        # branch below, so the tag scanner sees the file unmodified.
        return RemoveTag(content).replace('\n', ' ').replace('\t', ' ')
    return textract.process(
        arquivo.path,
        language='pt-br').decode('utf-8').replace('\n', ' ').replace(
            '\t', ' ')
def print_error(self, arquivo):
msg = 'Erro inesperado processando arquivo: %s' % (
@ -82,7 +90,8 @@ class TextExtractField(CharField):
except ExtensionNotSupported as e:
print(str(e))
logger.error(str(e))
except Exception:
except Exception as e2:
print(str(e2))
self.print_error(arquivo)
return ''

22
sapl/utils.py

@ -707,12 +707,28 @@ def TrocaTag(texto, startTag, endTag, sizeStart, sizeEnd, styleName):
if (texto[i:i + sizeEnd] == endTag):
textoSaida += 'blockTable><para>'
insideTag = 0
i = i + sizeEnd
i += sizeEnd
else:
textoSaida += texto[i]
i = i + 1
i += 1
else:
textoSaida += texto[i]
i = i + 1
i += 1
return textoSaida
def RemoveTag(texto):
    """Return ``texto`` with every ``<...>`` tag skipped over.

    The text is scanned left to right. Ordinary characters are copied to
    the output; when a ``'<'`` is found, nothing is copied and scanning
    resumes at the index returned by ``ExtraiTag(texto, i)``.

    :param texto: string to be stripped of tags.
    :return: a new string holding only the characters found outside tags.
    """
    pedacos = []  # collected characters, joined once at the end
    tamanho = len(texto)
    i = 0
    while i < tamanho:
        if texto[i] == '<':
            # NOTE(review): ExtraiTag is defined elsewhere in this module;
            # presumably it returns the index just past the closing '>'.
            # Confirm it always advances, otherwise this loop never ends.
            i = ExtraiTag(texto, i)
        else:
            pedacos.append(texto[i])
            i += 1
    return ''.join(pedacos)

Loading…
Cancel
Save