Browse Source

Extract verbose names from original forms html

pull/6/head
Marcio Mazza 10 years ago
committed by Marcio Mazza
parent
commit
63a23304f2
  1. 87
      legacy/scripts/extract_labels.py

87
legacy/scripts/extract_labels.py

@ -1,81 +1,76 @@
import os import os
import string import string
from difflib import SequenceMatcher
from itertools import chain
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from django.template.defaultfilters import slugify
from materia.models import MateriaLegislativa
from bs4.element import NavigableString, Tag from bs4.element import NavigableString, Tag
from field_mappings import field_mappings
def _label_from_td(td): def _read_line(tr):
return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace) for td in tr.find_all('td'):
label = td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
names = [c.attrs['name'] for c in td.children if isinstance(c, Tag) and 'name' in c.attrs]
if names:
name = names[0].split('_', 1)[-1]
yield name, label
# TODO: improve, getting ids inputs def extract_title_and_fieldsets(model):
# TODO: improve, getting fieldsets filename = os.path.join(os.path.dirname(__file__),
def get_fieldsets(filename): 'original_forms/%s.html' % model.__name__)
"""Extract labels from a file containing the html source of a rendered
legacy sapl form
"""
with open(filename, 'r') as f: with open(filename, 'r') as f:
cont = f.read() cont = f.read()
html_doc = cont.decode('utf-8') html_doc = cont.decode('utf-8')
soup = BeautifulSoup(html_doc, 'html.parser') soup = BeautifulSoup(html_doc, 'html.parser')
forms = soup.find_all('form') forms = soup.find_all('form')
[form] = [f for f in forms if (u'method', u'post') in f.attrs.items()] [form] = [f for f in forms if (u'method', u'post') in f.attrs.items()]
# children are either tags or strings... # children are either tags or strings...
assert set(type(c) for c in form.children) == {Tag, NavigableString} assert set(type(c) for c in form.children) == {Tag, NavigableString}
# ... and all strings are empty # ... and all strings are empty
assert all(not c.strip() for c in form.children if isinstance(c, NavigableString)) assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))
for fieldset in form.find_all('fieldset'): title = soup.find('h1', {'class': 'firstHeading'})
legend = fieldset.find('legend').text title = title.text if title else None
yield dict( fieldsets = [dict(
legend=legend, legend=fieldset.find('legend').text,
lines=[[_label_from_td(td) for td in tr.find_all('td')] lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])
for tr in fieldset.find_all('tr')] for fieldset in form.find_all('fieldset')]
)
return title, fieldsets
def get_labels(fieldsets):
def get_names_labels(fieldsets):
for fieldset in fieldsets: for fieldset in fieldsets:
for line in fieldset['lines']: for line in fieldset['lines']:
for label in line: for name, label in line:
yield label yield name, label
def print_fieldsets(fieldsets): def print_title_and_fieldsets(model):
title, fieldsets = extract_title_and_fieldsets(model)
print '#### %s ####\n' % title
for fieldset in fieldsets: for fieldset in fieldsets:
print fieldset['legend'] print fieldset['legend']
for line in fieldset['lines']: for line in fieldset['lines']:
print ' ' + ', '.join(line) print ' ' + ' | '.join('%s : %s' % (id, label) for id, label in line)
def similar(a, b): def extract_verbose_names(model):
return SequenceMatcher(None, a, b).ratio() > 0.6 title, fieldsets = extract_title_and_fieldsets(model)
names_to_labels = dict(get_names_labels(fieldsets))
model = MateriaLegislativa
filename = os.path.join(os.path.dirname(__file__),
'original_forms/%s.html' % model.__name__)
fieldsets = list(get_fieldsets(filename))
labels = get_labels(fieldsets)
slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
field_names = [f.name for f in model._meta.fields if f.name != 'id'] field_names = [f.name for f in model._meta.fields if f.name != 'id']
matches = {} matches = {}
field_names_to_old = field_mappings[model]
while field_names: for name in field_names:
percent, field, slug, label = sorted( old_name = field_names_to_old[name]
[(similar(a, slug), a, slug, label) label = names_to_labels.get(old_name, None)
for a in field_names if label:
for (slug, label) in slugs_to_labels])[-1] matches[name] = label
if percent > 0.6: del names_to_labels[old_name]
matches[field] = (label, percent) for name, label in matches.items():
slugs_to_labels.remove((slug, label)) field_names.remove(name)
else: non_matched = field_names, names_to_labels
print 'Label not found for [%s]' % field return title, matches, non_matched
field_names.remove(field)

Loading…
Cancel
Save