From bd9b3475d66e1e865f10e620cb11d6a9b6b318f9 Mon Sep 17 00:00:00 2001 From: Marcio Mazza Date: Sun, 21 Jun 2015 12:38:33 -0300 Subject: [PATCH] Start label matching from original forms html --- legacy/scripts/extract_labels.py | 34 +- .../original_forms/MateriaLegislativa.html | 952 ++++++++++++++++++ 2 files changed, 985 insertions(+), 1 deletion(-) create mode 100644 legacy/scripts/original_forms/MateriaLegislativa.html diff --git a/legacy/scripts/extract_labels.py b/legacy/scripts/extract_labels.py index 42baacf1c..5d6e1a126 100644 --- a/legacy/scripts/extract_labels.py +++ b/legacy/scripts/extract_labels.py @@ -1,12 +1,20 @@ +import os import string -from bs4 import BeautifulSoup +from difflib import SequenceMatcher from itertools import chain +from bs4 import BeautifulSoup +from django.template.defaultfilters import slugify + +from materia.models import MateriaLegislativa + def _label_from_td(td): return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace) +# TODO: improve, getting ids inputs +# TODO: improve, getting fieldsets def get_labels(filename, flat=True): """Extract labels from a file containg the html source of a rendered legacy sapl form @@ -26,3 +34,27 @@ def get_labels(filename, flat=True): else: return labels + +def similar(a, b): + return SequenceMatcher(None, a, b).ratio() > 0.6 + +model = MateriaLegislativa +filename = os.path.join(os.path.dirname(__file__), + 'original_forms/%s.html' % model.__name__) +labels = get_labels(filename) +slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels] +field_names = [f.name for f in model._meta.fields if f.name != 'id'] + +matches = {} + +while field_names: + percent, field, slug, label = sorted( + [(similar(a, slug), a, slug, label) + for a in field_names + for (slug, label) in slugs_to_labels])[-1] + if percent > 0.6: + matches[field] = (label, percent) + slugs_to_labels.remove((slug, label)) + else: + print 'Label not found for [%s]' % field + field_names.remove(field) diff --git a/legacy/scripts/original_forms/MateriaLegislativa.html b/legacy/scripts/original_forms/MateriaLegislativa.html new file mode 100644 index 000000000..241de62c8 --- /dev/null +++ b/legacy/scripts/original_forms/MateriaLegislativa.html @@ -0,0 +1,952 @@ + + + + + + + + + + + + + + Sistema de Apoio ao Processo Legislativo + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+ + + +
+

+ Câmara Municipal de Demonstração + + - DF +

+

Sistema de Apoio ao Processo Legislativo

+
+
+
+ +
+ + + + + + + + + + + +Ajuda +

Matéria Legislativa

+ + + +
+ + + + + + +
+ Identificação Básica + + + + + + + + + + + + + + + + + + + +

+ + + +

+ + +

+ +

+ +

+ + +

+ + Oral + + Escrita +

+ + +
+
+ + Gerar ODT +
+
+
+ Proposição Eletrônica + + + + +
+ Esta matéria não foi gerada a partir de uma proposição eletrônica. +
+
+ + +
+ Outras Informações + + + + + + + + + + + + + + + + + +

+ +

+ +

+ + Sim + + Não +

+ +

+

+ + Sim + + Não +

+ +

+ +

+ + Sim + + Não +

+ +
+
+
+ Origem Externa + + + + + + + + + + +

+ +

+ +

+
+

+ +

+ +
+
+
+ Dados Textuais + + + + + + + + + + +
  + +
+ +
+ +
+
+ +

+ + +

+
+ +
+
+ +
+
+ + +
+
+
+ + + + + \ No newline at end of file