# sapl/legacy/scripts/extract_labels.py


								import os

								import string

								from difflib import SequenceMatcher

								from itertools import chain


								from bs4 import BeautifulSoup

								from django.template.defaultfilters import slugify


								from materia.models import MateriaLegislativa

								from bs4.element import NavigableString, Tag


								def _label_from_td(td):
								    """Return the label held by a table cell.

								    The label is the first line of the cell's stripped text, with any
								    surrounding whitespace and non-breaking spaces removed.
								    """
								    first_line, _, _ = td.text.strip().partition('\n')
								    return first_line.strip(string.whitespace + u'\xa0')


								# TODO: improve, getting ids inputs

								# TODO: improve, getting fieldsets

								def get_fieldsets(filename):
								    """Extract labels from a file containing the html source of a rendered
								    legacy sapl form.

								    This is a generator: nothing is read or parsed until it is iterated.
								    It yields one dict per <fieldset> found inside the single form whose
								    ``method`` attribute is ``post``:

								      legend -- text of the fieldset's <legend>
								      lines  -- one list per <tr>, holding the label of each <td>

								    Raises ValueError (from the unpacking) unless exactly one such form
								    exists, and AssertionError when the form's direct children do not
								    have the expected shape.
								    """
								    # Read raw bytes and decode explicitly: this yields unicode text on both
								    # Python 2 and Python 3 (a text-mode str has no .decode() on Python 3).
								    with open(filename, 'rb') as f:
								        html_doc = f.read().decode('utf-8')
								    soup = BeautifulSoup(html_doc, 'html.parser')
								    forms = soup.find_all('form')
								    [form] = [candidate for candidate in forms
								              if (u'method', u'post') in candidate.attrs.items()]

								    # children are either tags or strings...
								    assert set(type(c) for c in form.children) == {Tag, NavigableString}
								    # ... and all strings are empty
								    assert all(not c.strip() for c in form.children
								               if isinstance(c, NavigableString))

								    for fieldset in form.find_all('fieldset'):
								        legend = fieldset.find('legend').text
								        yield dict(
								            legend=legend,
								            lines=[[_label_from_td(td) for td in tr.find_all('td')]
								                   for tr in fieldset.find_all('tr')]
								        )


								def get_labels(fieldsets):
								    """Lazily yield every label of every line, fieldset by fieldset.

								    Each item of ``fieldsets`` is a mapping whose 'lines' entry is an
								    iterable of lines, each line being an iterable of labels.
								    """
								    for entry in fieldsets:
								        # Flatten this fieldset's lines into a single stream of labels.
								        for text in chain.from_iterable(entry['lines']):
								            yield text


								def print_fieldsets(fieldsets):
								    """Print each fieldset's legend followed by its lines of labels.

								    Every line of labels is printed on its own row, indented by two
								    spaces, with the labels joined by ', '.
								    """
								    for fieldset in fieldsets:
								        # A parenthesized single argument prints identically on Python 2
								        # (statement) and Python 3 (function); the bare statement form is
								        # a SyntaxError on Python 3.
								        print(fieldset['legend'])
								        for line in fieldset['lines']:
								            print('  ' + ', '.join(line))


								def similar(a, b, threshold=0.6):
								    """Tell whether two sequences are similar enough.

								    Returns True when the difflib ``SequenceMatcher`` ratio of ``a`` and
								    ``b`` is strictly greater than ``threshold``. A ratio exactly equal to
								    the threshold does not count as similar. ``threshold`` defaults to the
								    previously hard-coded 0.6.
								    """
								    return SequenceMatcher(None, a, b).ratio() > threshold


								model = MateriaLegislativa
								filename = os.path.join(os.path.dirname(__file__),
								                        'original_forms/%s.html' % model.__name__)
								fieldsets = list(get_fieldsets(filename))
								labels = get_labels(fieldsets)
								# (slug, label) pairs; the slug is the label lowercased and slugified with
								# '-' turned into '_', so that it resembles a model field name.
								slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
								field_names = [f.name for f in model._meta.fields if f.name != 'id']

								# Maps a model field name to (label, similarity ratio) of its matched label.
								matches = {}

								# Greedy matching: repeatedly take the globally most similar (field, slug)
								# pair, record it if it is similar enough, and retire the field either way.
								while field_names:
								    if not slugs_to_labels:
								        # All labels are already taken: the remaining fields cannot match
								        # (taking the best of an empty candidate list would raise).
								        for field in field_names:
								            print('Label not found for [%s]' % field)
								        del field_names[:]
								        break
								    # Rank by the actual similarity ratio. similar() only returns a bool,
								    # which cannot distinguish a good match from a barely acceptable one.
								    percent, field, slug, label = max(
								        (SequenceMatcher(None, a, slug).ratio(), a, slug, label)
								        for a in field_names
								        for (slug, label) in slugs_to_labels)
								    if percent > 0.6:
								        matches[field] = (label, percent)
								        # A label can be assigned to one field only.
								        slugs_to_labels.remove((slug, label))
								    else:
								        print('Label not found for [%s]' % field)
								    field_names.remove(field)