def whoosh_extraction(self, arquivo):
    """Return the text content of ``arquivo`` as a single flat string.

    Files whose path ends in ``html`` or ``xml`` are read directly as UTF-8
    text (undecodable bytes are ignored), have their newlines and tabs
    replaced by spaces, and are then passed through ``RemoveTag``.  Every
    other file is handed to ``textract.process`` and the decoded result
    gets the same newline/tab replacement.

    :param arquivo: object exposing the file location as ``arquivo.path``.
    :return: the extracted text.
    """
    caminho = arquivo.path

    # str.endswith accepts a tuple, so one call covers both suffixes.
    if caminho.endswith(('html', 'xml')):
        with open(caminho, 'r', encoding='utf8', errors='ignore') as f:
            # Operate on the whole text.  Calling ' '.join() directly on
            # the string returned by read() iterates it character by
            # character and puts a space between every single character,
            # which mangles both the words and the tags.
            content = f.read().replace('\n', ' ').replace('\t', ' ')
        return RemoveTag(content)

    return textract.process(
        caminho,
        language='pt-br').decode('utf-8').replace('\n', ' ').replace(
            '\t', ' ')
def RemoveTag(texto):
    """Return ``texto`` without the spans skipped by ``ExtraiTag``.

    The text is scanned left to right.  Whenever a ``'<'`` is found,
    ``ExtraiTag(texto, i)`` is called and scanning resumes at the index it
    returns (presumably the position just past the tag -- ``ExtraiTag`` is
    defined elsewhere; confirm against its implementation).  Everything
    between such spans is copied to the output unchanged.

    :param texto: the string to process.
    :return: the text that remains after the skipped spans are dropped.
    """
    pedacos = []
    tamanho = len(texto)
    i = 0

    while i < tamanho:
        if texto[i] == '<':
            i = ExtraiTag(texto, i)
        else:
            # Copy the whole run up to the next '<' in one slice instead
            # of growing the result one character at a time.
            proximo = texto.find('<', i)
            if proximo == -1:
                proximo = tamanho
            pedacos.append(texto[i:proximo])
            i = proximo

    return ''.join(pedacos)