import logging
import os.path
import re
import string

import textract
from django.db.models import F, Q, Value
from django.db.models.fields import TextField
from django.db.models.fields.files import FieldFile
from django.db.models.functions import Concat
from django.template import loader
from haystack.constants import Indexable
from haystack.fields import CharField
from haystack.indexes import SearchIndex
from haystack.utils import get_model_ct_tuple
from textract.exceptions import ExtensionNotSupported

from sapl.compilacao.models import (STATUS_TA_IMMUTABLE_PUBLIC,
                                    STATUS_TA_PUBLIC, Dispositivo,
                                    TextoArticulado)
from sapl.materia.models import DocumentoAcessorio, MateriaLegislativa
from sapl.norma.models import NormaJuridica
from sapl.settings import BASE_DIR, SOLR_URL
from sapl.utils import RemoveTag

logger = logging.getLogger(BASE_DIR.name)


class TextExtractField(CharField):
    """Haystack char field whose content is extracted from model attributes.

    ``model_attr`` is expected to be a sequence of ``(attribute, extractor)``
    pairs, where ``attribute`` names an attribute of the indexed object and
    ``extractor`` names a method of this field that turns the attribute value
    into plain text (``file_extractor``, ``ta_extractor`` or
    ``string_extractor``).
    """

    def __init__(self, **kwargs):
        """Initialise the field and normalise ``model_attr`` to a sequence."""
        super().__init__(**kwargs)

        # A field of this type is meaningless without attributes to extract.
        assert self.model_attr

        if not isinstance(self.model_attr, (list, tuple)):
            self.model_attr = (self.model_attr, )

    def solr_extraction(self, arquivo):
        """Extract the text of ``arquivo`` through the search backend.

        Returns the extracted content with markup tags, line breaks, tabs,
        punctuation and repeated whitespace removed.
        """
        # NOTE(review): relies on `_get_backend` being available on this
        # field -- confirm against the installed haystack version.
        extracted_data = self._get_backend(None).extract_file_contents(
            arquivo)['contents']
        # Strip XML tags.
        extracted_data = re.sub('<[^>]*>', '', extracted_data)
        # Replace line breaks and tabs with spaces.
        extracted_data = extracted_data.replace(
            '\n', ' ').replace('\t', ' ')
        # Replace punctuation with spaces. The punctuation set is escaped so
        # that ']', '\\', '^' and '-' are taken literally inside the
        # character class (unescaped, the backslash was silently dropped
        # from the set).
        extracted_data = re.sub('[' + re.escape(string.punctuation) + ']',
                                ' ', extracted_data)
        # Collapse runs of whitespace into single spaces.
        extracted_data = " ".join(extracted_data.split())
        return extracted_data

    def whoosh_extraction(self, arquivo):
        """Extract the text of ``arquivo`` locally, without a backend.

        HTML/XML files are read directly and passed through ``RemoveTag``;
        any other file is handed to ``textract``.
        """
        if arquivo.path.endswith(('html', 'xml')):
            with open(arquivo.path, 'r',
                      encoding="utf8", errors='ignore') as f:
                # Collapse whitespace between words. Joining the raw string
                # itself would put a space between every single character.
                content = ' '.join(f.read().split())
                # presumably strips markup tags -- verify in sapl.utils
                return RemoveTag(content)
        else:
            return textract.process(
                arquivo.path,
                language='pt-br').decode('utf-8').replace('\n', ' ').replace(
                '\t', ' ')

    def print_error(self, arquivo):
        """Print and log a generic processing error for ``arquivo``."""
        msg = 'Erro inesperado processando arquivo: %s' % (
            arquivo.path)
        print(msg)
        logger.error(msg)

    def file_extractor(self, arquivo):
        """Return the text content of the file ``arquivo``.

        Returns an empty string when the file does not exist, has no
        extension, or when extraction fails (failures are printed/logged,
        never raised).
        """
        if not os.path.exists(arquivo.path) or \
                not os.path.splitext(arquivo.path)[1][:1]:
            return ''

        # In production SOLR is used.
        if SOLR_URL:
            try:
                return self.solr_extraction(arquivo)
            except Exception:
                self.print_error(arquivo)

        # In development Whoosh is used. It has no extraction of its own,
        # so textract is used instead.
        else:
            try:
                return self.whoosh_extraction(arquivo)
            except ExtensionNotSupported as e:
                print(str(e))
                logger.error(str(e))
            except Exception as e2:
                print(str(e2))
                self.print_error(arquivo)
        return ''

    def ta_extractor(self, value):
        """Return the text of the public articulated texts in ``value``.

        For every item of ``value`` whose ``privacidade`` is public, the
        ``rotulo`` and ``texto`` of its ``Dispositivo`` rows are concatenated
        (ordered by ``ordem``); blank entries are dropped and the rest is
        joined with single spaces.
        """
        r = []
        for ta in value.filter(privacidade__in=[
                STATUS_TA_PUBLIC,
                STATUS_TA_IMMUTABLE_PUBLIC]):
            dispositivos = Dispositivo.objects.filter(
                Q(ta=ta) | Q(ta_publicado=ta)
            ).order_by(
                'ordem'
            ).annotate(
                rotulo_texto=Concat(
                    F('rotulo'), Value(' '), F('texto'),
                    output_field=TextField(),
                )
            ).values_list(
                'rotulo_texto', flat=True)
            r += [texto for texto in dispositivos if texto.strip()]
        return ' '.join(r)

    def string_extractor(self, value):
        """Return ``value`` unchanged (extractor for plain text attributes)."""
        return value

    def extract_data(self, obj):
        """Run every configured extractor on ``obj`` and return the text.

        The non-empty results are joined with a single space so that the
        last word of one attribute is not fused with the first word of the
        next one.

        Raises:
            Exception: if ``obj`` lacks a configured attribute or this field
                lacks the configured extractor method.
        """
        parts = []

        for attr, func in self.model_attr:
            if not hasattr(obj, attr) or not hasattr(self, func):
                raise Exception(
                    'Invalid model_attr entry (%r, %r) for %r' % (
                        attr, func, obj))

            value = getattr(obj, attr)
            if not value:
                continue

            extracted = getattr(self, func)(value)
            if extracted:
                parts.append(extracted)

        return ' '.join(parts)

    def prepare_template(self, obj):
        """Render the index template for ``obj``.

        The template is looked up as
        ``search/indexes/<app_label>/<model_name>_<instance_name>.txt`` and
        receives the object as ``object`` and the extracted text as
        ``extracted``.
        """
        app_label, model_name = get_model_ct_tuple(obj)
        template_names = ['search/indexes/%s/%s_%s.txt' %
                          (app_label, model_name, self.instance_name)]

        t = loader.select_template(template_names)

        return t.render({'object': obj,
                         'extracted': self.extract_data(obj)})


class DocumentoAcessorioIndex(SearchIndex, Indexable):
    """Search index for ``DocumentoAcessorio``; base for the other indexes."""

    # Model indexed by this class; subclasses override it.
    model = DocumentoAcessorio
    # Document field built from the file plus descriptive text attributes.
    text = TextExtractField(
        document=True, use_template=True,
        model_attr=(
            ('arquivo', 'file_extractor'),
            ('ementa', 'string_extractor'),
            ('indexacao', 'string_extractor'),
        )
    )

    def get_model(self):
        """Return the model class indexed by this index."""
        return self.model

    def index_queryset(self, using=None):
        """Return all objects of the indexed model."""
        return self.get_model().objects.all()

    def get_updated_field(self):
        """Return the name of the model field holding the last update."""
        return 'data_ultima_atualizacao'


class NormaJuridicaIndex(DocumentoAcessorioIndex):
    """Search index for ``NormaJuridica``."""

    model = NormaJuridica
    text = TextExtractField(
        document=True, use_template=True,
        model_attr=(
            ('texto_integral', 'file_extractor'),
            ('texto_articulado', 'ta_extractor'),
            ('ementa', 'string_extractor'),
            ('indexacao', 'string_extractor'),
            ('observacao', 'string_extractor'),
        )
    )


class MateriaLegislativaIndex(DocumentoAcessorioIndex):
    """Search index for ``MateriaLegislativa``."""

    model = MateriaLegislativa
    text = TextExtractField(
        document=True, use_template=True,
        model_attr=(
            ('texto_original', 'file_extractor'),
            ('texto_articulado', 'ta_extractor'),
            ('ementa', 'string_extractor'),
            ('indexacao', 'string_extractor'),
            ('observacao', 'string_extractor'),
        )
    )