Browse Source

Extract verbose names from original forms html

pull/6/head
Marcio Mazza 10 years ago
committed by Marcio Mazza
parent
commit
63a23304f2
  1. 87
      legacy/scripts/extract_labels.py

87
legacy/scripts/extract_labels.py

@ -1,81 +1,76 @@
import os
import string
from difflib import SequenceMatcher
from itertools import chain
from bs4 import BeautifulSoup
from django.template.defaultfilters import slugify
from materia.models import MateriaLegislativa
from bs4.element import NavigableString, Tag
from field_mappings import field_mappings
def _label_from_td(td):
    """Return the label text of a table cell.

    The label is the first line of the cell's stripped text, with
    non-breaking spaces and ordinary whitespace removed from both ends.

    `td` is any object exposing a `text` string attribute.
    """
    # Only the text before the first line break is considered.
    first_line, _, _ = td.text.strip().partition('\n')
    return first_line.strip(u'\xa0' + string.whitespace)
def _read_line(tr):
    """Yield a ``(name, label)`` pair for each named cell of a table row.

    For every `td` found in `tr`:

    * the label is the first line of the cell's stripped text, with
      non-breaking spaces and whitespace removed from both ends;
    * the name comes from the first direct child tag that has a `name`
      attribute, with everything up to and including its first underscore
      dropped (the whole value is kept when there is no underscore).

    Cells without any such child are skipped.
    """
    edge_chars = u'\xa0' + string.whitespace
    for cell in tr.find_all('td'):
        first_line = cell.text.strip().split('\n')[0]
        label = first_line.strip(edge_chars)
        # Only direct children that are tags carrying a `name` attribute count.
        named = [child.attrs['name'] for child in cell.children
                 if isinstance(child, Tag) and 'name' in child.attrs]
        if not named:
            continue
        yield named[0].split('_', 1)[-1], label
# TODO: improve, getting ids inputs
# TODO: improve, getting fieldsets
# NOTE(review): the lines in this span carry no indentation and no diff
# markers, and two function headers follow one another. It looks like a
# rendered diff in which a previous version (`get_fieldsets`) and a current
# version (`extract_title_and_fieldsets`) are interleaved -- confirm against
# the repository before relying on the statement order shown here.
def get_fieldsets(filename):
"""Extract labels from a file containing the html source of a rendered
legacy sapl form
"""
def extract_title_and_fieldsets(model):
# The html file is looked up by the model's class name, inside the
# `original_forms` directory located next to this script.
filename = os.path.join(os.path.dirname(__file__),
'original_forms/%s.html' % model.__name__)
with open(filename, 'r') as f:
cont = f.read()
# The content read from disk is decoded as UTF-8 before being parsed.
html_doc = cont.decode('utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')
forms = soup.find_all('form')
# Keep the form whose attributes include method == 'post'. The single-element
# unpacking raises ValueError unless exactly one form matches.
[form] = [f for f in forms if (u'method', u'post') in f.attrs.items()]
# children are either tags or strings...
assert set(type(c) for c in form.children) == {Tag, NavigableString}
# ... and all strings are empty
assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))
# Generator variant: yields one dict per <fieldset>, holding its legend text
# and, for each <tr>, the list of labels taken from the row's <td> cells.
for fieldset in form.find_all('fieldset'):
legend = fieldset.find('legend').text
yield dict(
legend=legend,
lines=[[_label_from_td(td) for td in tr.find_all('td')]
for tr in fieldset.find_all('tr')]
)
# Page title: text of the <h1 class="firstHeading"> element, or None when the
# document has no such element.
title = soup.find('h1', {'class': 'firstHeading'})
title = title.text if title else None
# List variant: one dict per <fieldset>, holding its legend text and, for each
# <tr>, the list of (name, label) pairs produced by `_read_line`.
fieldsets = [dict(
legend=fieldset.find('legend').text,
lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])
for fieldset in form.find_all('fieldset')]
return title, fieldsets
# NOTE(review): no indentation or diff markers survive in this span and two
# function headers follow one another; it looks like a previous version
# (`get_labels`) interleaved with a current one (`get_names_labels`) --
# confirm against the repository.
def get_labels(fieldsets):
def get_names_labels(fieldsets):
# Both loops below flatten the `lines` of every fieldset into a single stream.
for fieldset in fieldsets:
for line in fieldset['lines']:
# This loop yields each entry of the line unchanged.
for label in line:
yield label
# This loop unpacks each entry of the line as a (name, label) pair.
for name, label in line:
yield name, label
# NOTE(review): no indentation or diff markers survive in this span and two
# function headers follow one another; it looks like a previous version
# (`print_fieldsets`) interleaved with a current one
# (`print_title_and_fieldsets`) -- confirm against the repository.
def print_fieldsets(fieldsets):
def print_title_and_fieldsets(model):
# The title and fieldsets are extracted from the html form saved for `model`.
title, fieldsets = extract_title_and_fieldsets(model)
# The title is printed as a '#### title ####' header followed by a blank line.
print '#### %s ####\n' % title
for fieldset in fieldsets:
# Each fieldset is introduced by its legend.
print fieldset['legend']
for line in fieldset['lines']:
# This print joins the entries of the line with ', '.
print ' ' + ', '.join(line)
# This print renders every (id, label) pair as 'id : label', joined with ' | '.
print ' ' + ' | '.join('%s : %s' % (id, label) for id, label in line)
def similar(a, b, threshold=0.6):
    """Tell whether two sequences are similar enough to count as a match.

    The similarity is `difflib.SequenceMatcher(None, a, b).ratio()`, a float
    between 0.0 and 1.0.

    Args:
        a: first sequence (typically a string).
        b: second sequence (typically a string).
        threshold: ratio that must be strictly exceeded for the pair to be
            considered similar. Defaults to 0.6, the cutoff that used to be
            hard-coded here.

    Returns:
        True when the ratio is strictly greater than `threshold`, else False.
    """
    return SequenceMatcher(None, a, b).ratio() > threshold
# NOTE(review): no indentation or diff markers survive in this span. It looks
# like the body of `extract_verbose_names` interleaved with a previous
# similarity-based matching routine (the statements that use `slugs_to_labels`
# and `similar`) -- confirm against the repository.
def extract_verbose_names(model):
title, fieldsets = extract_title_and_fieldsets(model)
# Lookup table from each (name, label) pair found in the form: name -> label.
names_to_labels = dict(get_names_labels(fieldsets))
# Here `model` is rebound to MateriaLegislativa regardless of the argument.
model = MateriaLegislativa
filename = os.path.join(os.path.dirname(__file__),
'original_forms/%s.html' % model.__name__)
fieldsets = list(get_fieldsets(filename))
labels = get_labels(fieldsets)
# Each label is paired with a slug of itself: lower-cased, slugified, and with
# '-' replaced by '_'.
slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
# Every model field except `id` is a candidate for a verbose name.
field_names = [f.name for f in model._meta.fields if f.name != 'id']
matches = {}
# Similarity-based matching: on every pass the highest-sorting
# (similarity, field, slug, label) tuple is taken and its field is removed
# from `field_names`, so the loop ends once every field has been considered.
# NOTE(review): `similar` returns a boolean, so `percent` holds True/False
# here and `percent > 0.6` is equivalent to `percent` itself.
while field_names:
percent, field, slug, label = sorted(
[(similar(a, slug), a, slug, label)
for a in field_names
for (slug, label) in slugs_to_labels])[-1]
if percent > 0.6:
matches[field] = (label, percent)
# A label that has been matched is not offered to the remaining fields.
slugs_to_labels.remove((slug, label))
else:
print 'Label not found for [%s]' % field
field_names.remove(field)
# Mapping-based matching: `field_mappings[model]` is indexed by the current
# field name to obtain the old name -- presumably the name used in the legacy
# form; verify against `field_mappings`.
field_names_to_old = field_mappings[model]
for name in field_names:
old_name = field_names_to_old[name]
label = names_to_labels.get(old_name, None)
# Only non-empty labels are recorded; a consumed label is removed from
# `names_to_labels`, so what remains there was matched to no field.
if label:
matches[name] = label
del names_to_labels[old_name]
# Matched fields are dropped so that `field_names` keeps only the unmatched.
for name, label in matches.items():
field_names.remove(name)
# The result is (title, matches, (unmatched field names, unmatched labels)).
non_matched = field_names, names_to_labels
return title, matches, non_matched

Loading…
Cancel
Save