From 8cdbed8474e113e7822ca4871a183cc48622e8a2 Mon Sep 17 00:00:00 2001
From: Eduardo Calil <dudusaid@gmail.com>
Date: Tue, 4 Jul 2017 13:39:42 -0300
Subject: [PATCH] =?UTF-8?q?Melhora=20o=20c=C3=B3digo=20de=20indexa=C3=A7?=
 =?UTF-8?q?=C3=A3o?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sapl/base/search_indexes.py | 60 +++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/sapl/base/search_indexes.py b/sapl/base/search_indexes.py
index 5ed9a97cf..bee7a54e3 100644
--- a/sapl/base/search_indexes.py
+++ b/sapl/base/search_indexes.py
@@ -28,6 +28,34 @@ class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable):
     def index_queryset(self, using=None):
         return self.get_model().objects.all()
 
+    def solr_extraction(self, arquivo):
+        """Return the backend-extracted text of ``arquivo`` as plain words."""
+        extracted_data = self._get_backend(None).extract_file_contents(
+            arquivo)['contents']
+        # Strip XML tags
+        extracted_data = re.sub(r'<[^>]*>', '', extracted_data)
+        # Turn newlines and tabs into spaces
+        extracted_data = extracted_data.replace('\n', ' ').replace('\t', ' ')
+        # Blank out punctuation; escaped so that ``\`` is matched as well
+        extracted_data = re.sub('[' + re.escape(string.punctuation) + ']',
+                                ' ', extracted_data)
+        # Collapse runs of whitespace into a single space
+        extracted_data = " ".join(extracted_data.split())
+
+        return extracted_data
+
+    def whoosh_extraction(self, arquivo):
+        """Extract text with textract; newlines and tabs become spaces."""
+        raw = textract.process(arquivo.path, language='pt-br')
+        text = raw.decode('utf-8')
+        return text.replace('\n', ' ').replace('\t', ' ')
+
+    def print_error(self, arquivo):
+        """Print and log an error message naming ``arquivo.path``."""
+        message = 'Erro inesperado processando arquivo: %s' % arquivo.path
+        print(message)
+        logger.error(message)
+
     def prepare(self, obj):
         if not self.filename or not self.model or not self.template_name:
             raise Exception
@@ -45,37 +73,25 @@ class DocumentoAcessorioIndex(indexes.SearchIndex, indexes.Indexable):
 
             # Em ambiente de produção utiliza-se o SOLR
             if SOLR_URL:
-                extracted_data = self._get_backend(None).extract_file_contents(
-                    arquivo)['contents']
-                # Remove as tags xml
-                extracted_data = re.sub('<[^>]*>', '', extracted_data)
-                # Remove tags \t e \n
-                extracted_data = extracted_data.replace(
-                    '\n', ' ').replace('\t', ' ')
-                # Remove sinais de pontuação
-                extracted_data = re.sub('[' + string.punctuation + ']',
-                                        ' ', extracted_data)
-                # Remove espaços múltiplos
-                extracted_data = " ".join(extracted_data.split())
+                try:
+                    extracted_data = self.solr_extraction(arquivo)
+                except Exception:
+                    self.print_error(arquivo)
+                    return self.prepared_data
 
             # Em ambiente de DEV utiliza-se o Whoosh
             # Como ele não possui extração, faz-se uso do textract
             else:
                 try:
-                    extracted_data = textract.process(
-                        arquivo.path,
-                        language='pt-br').decode('utf-8').replace('\n', ' ')
-                except ExtensionNotSupported:
+                    extracted_data = self.whoosh_extraction(arquivo)
+                except ExtensionNotSupported as e:
+                    print(str(e))
+                    logger.error(str(e))
                     return self.prepared_data
                 except Exception:
-                    msg = 'Erro inesperado processando arquivo: %s' % (
-                        arquivo.path)
-                    print(msg)
-                    logger.error(msg)
+                    self.print_error(arquivo)
                     return self.prepared_data
 
-                extracted_data = extracted_data.replace('\t', ' ')
-
             # Now we'll finally perform the template processing to render the
             # text field with *all* of our metadata visible for templating:
             t = loader.select_template((