diff --git a/sapl/base/search_indexes.py b/sapl/base/search_indexes.py
index f685cb002..ed811cebb 100644
--- a/sapl/base/search_indexes.py
+++ b/sapl/base/search_indexes.py
@@ -21,6 +21,7 @@ from sapl.compilacao.models import (STATUS_TA_IMMUTABLE_PUBLIC,
 from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa
 from sapl.norma.models import NormaJuridica
 from sapl.settings import BASE_DIR, SOLR_URL
+from sapl.utils import RemoveTag
 
 logger = logging.getLogger(BASE_DIR.name)
 
@@ -51,10 +52,24 @@ class TextExtractField(CharField):
         return extracted_data
 
     def whoosh_extraction(self, arquivo):
-        return textract.process(
-            arquivo.path,
-            language='pt-br').decode('utf-8').replace('\n', ' ').replace(
-                '\t', ' ')
+        """Return the plain text of ``arquivo`` for the Whoosh index.
+
+        Paths ending in ``html`` or ``xml`` are read directly and stripped
+        of markup with ``RemoveTag``; every other file is handed to
+        ``textract``. Newlines and tabs become spaces in both branches.
+        """
+        if arquivo.path.endswith(('html', 'xml')):
+            with open(arquivo.path, 'r', encoding="utf8",
+                      errors='ignore') as f:
+                # Keep the text as one string: ' '.join() over a str walks
+                # its characters and would put a space between each letter.
+                content = f.read().replace('\n', ' ').replace('\t', ' ')
+            return RemoveTag(content)
+
+        return textract.process(
+            arquivo.path,
+            language='pt-br').decode('utf-8').replace('\n', ' ').replace(
+                '\t', ' ')
 
     def print_error(self, arquivo):
         msg = 'Erro inesperado processando arquivo: %s' % (
@@ -82,7 +97,9 @@ class TextExtractField(CharField):
             except ExtensionNotSupported as e:
                 print(str(e))
                 logger.error(str(e))
-            except Exception:
+            except Exception as e2:
+                print(str(e2))
+                logger.error(str(e2))
                 self.print_error(arquivo)
 
         return ''
diff --git a/sapl/utils.py b/sapl/utils.py
index 196e5b5cf..8f123b2ef 100644
--- a/sapl/utils.py
+++ b/sapl/utils.py
@@ -716,3 +716,28 @@ def TrocaTag(texto, startTag, endTag, sizeStart, sizeEnd, styleName):
             i = i + 1
 
     return textoSaida
+
+
+def RemoveTag(texto):
+    """Return ``texto`` with every ``<...>`` markup tag stripped out.
+
+    Characters outside tags are kept unchanged. Each ``<`` hands control
+    to ``ExtraiTag`` (defined elsewhere in this module), which is expected
+    to return the index right after the tag -- TODO confirm.
+    """
+    partes = []
+    i = 0
+
+    while i < len(texto):
+        if texto[i] == '<':
+            i = ExtraiTag(texto, i)
+            if i is None:
+                # NOTE(review): guards an unterminated tag, assuming
+                # ExtraiTag yields None when there is no closing '>'.
+                break
+        else:
+            partes.append(texto[i])
+            i = i + 1
+
+    # Join once: repeated ``+=`` on a str can degrade to quadratic time.
+    return ''.join(partes)