diff --git a/legacy/scripts/extract_labels.py b/legacy/scripts/extract_labels.py index 42baacf1c..5d6e1a126 100644 --- a/legacy/scripts/extract_labels.py +++ b/legacy/scripts/extract_labels.py @@ -1,12 +1,20 @@ +import os import string -from bs4 import BeautifulSoup +from difflib import SequenceMatcher from itertools import chain +from bs4 import BeautifulSoup +from django.template.defaultfilters import slugify + +from materia.models import MateriaLegislativa + def _label_from_td(td): return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace) +# TODO: improve, getting ids inputs +# TODO: improve, getting fieldsets def get_labels(filename, flat=True): """Extract labels from a file containg the html source of a rendered legacy sapl form @@ -26,3 +34,27 @@ def get_labels(filename, flat=True): else: return labels + +def similar(a, b): + return SequenceMatcher(None, a, b).ratio() > 0.6 + +model = MateriaLegislativa +filename = os.path.join(os.path.dirname(__file__), + 'original_forms/%s.html' % model.__name__) +labels = get_labels(filename) +slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels] +field_names = [f.name for f in model._meta.fields if f.name != 'id'] + +matches = {} + +while field_names: + percent, field, slug, label = sorted( + [(similar(a, slug), a, slug, label) + for a in field_names + for (slug, label) in slugs_to_labels])[-1] + if percent > 0.6: + matches[field] = (label, percent) + slugs_to_labels.remove((slug, label)) + else: + print 'Label not found for [%s]' % field + field_names.remove(field) diff --git a/legacy/scripts/original_forms/MateriaLegislativa.html b/legacy/scripts/original_forms/MateriaLegislativa.html new file mode 100644 index 000000000..241de62c8 --- /dev/null +++ b/legacy/scripts/original_forms/MateriaLegislativa.html @@ -0,0 +1,952 @@ + + + + + + +
+ + + + + + +