mirror of https://github.com/interlegis/sapl.git
Marcio Mazza
10 years ago
committed by
Marcio Mazza
1 changed files with 47 additions and 52 deletions
@ -1,81 +1,76 @@ |
|||
import os |
|||
import string |
|||
from difflib import SequenceMatcher |
|||
from itertools import chain |
|||
|
|||
from bs4 import BeautifulSoup |
|||
from django.template.defaultfilters import slugify |
|||
|
|||
from materia.models import MateriaLegislativa |
|||
from bs4.element import NavigableString, Tag |
|||
|
|||
from field_mappings import field_mappings |
|||
|
|||
|
|||
def _label_from_td(td): |
|||
return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace) |
|||
def _read_line(tr): |
|||
for td in tr.find_all('td'): |
|||
label = td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace) |
|||
names = [c.attrs['name'] for c in td.children if isinstance(c, Tag) and 'name' in c.attrs] |
|||
if names: |
|||
name = names[0].split('_', 1)[-1] |
|||
yield name, label |
|||
|
|||
|
|||
# TODO: improve, getting ids inputs |
|||
# TODO: improve, getting fieldsets |
|||
def _parse_post_form(filename):
    """Parse the html file *filename* and return ``(soup, form)``.

    ``form`` is the single ``<form>`` whose attributes include
    ``method="post"``; the list unpacking raises ``ValueError`` when
    there is not exactly one such form.
    """
    # Function-scope import keeps this block self-contained.  io.open
    # decodes while reading and works on both Python 2 and Python 3.
    import io

    with io.open(filename, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')

    [form] = [candidate for candidate in soup.find_all('form')
              if (u'method', u'post') in candidate.attrs.items()]

    # children are either tags or strings...
    assert set(type(c) for c in form.children) == {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip() for c in form.children
               if isinstance(c, NavigableString))
    return soup, form


def get_fieldsets(filename):
    """Extract labels from a file containing the html source of a
    rendered legacy sapl form.

    Yields one dict per ``<fieldset>`` of the form, with keys
    ``legend`` (the text of the fieldset's legend) and ``lines`` (one
    list of cell labels per table row of the fieldset).
    """
    _, form = _parse_post_form(filename)
    for fieldset in form.find_all('fieldset'):
        yield dict(
            legend=fieldset.find('legend').text,
            lines=[[_label_from_td(td) for td in tr.find_all('td')]
                   for tr in fieldset.find_all('tr')])


def extract_title_and_fieldsets(model):
    """Return ``(title, fieldsets)`` read from the saved form of *model*.

    The html is read from ``original_forms/<model.__name__>.html``
    next to this module.  ``title`` is the text of the
    ``h1.firstHeading`` element, or ``None`` when the page has none.
    ``fieldsets`` is a list with one dict per ``<fieldset>`` of the
    form: ``legend`` holds the legend text and ``lines`` holds, for
    each table row, the list of ``(name, label)`` pairs produced by
    ``_read_line``.
    """
    filename = os.path.join(os.path.dirname(__file__),
                            'original_forms/%s.html' % model.__name__)
    soup, form = _parse_post_form(filename)

    heading = soup.find('h1', {'class': 'firstHeading'})
    title = heading.text if heading is not None else None
    fieldsets = [dict(
        legend=fieldset.find('legend').text,
        lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])
        for fieldset in form.find_all('fieldset')]

    return title, fieldsets
|||
|
|||
def get_labels(fieldsets):
    """Yield every label of every line of every fieldset, in order.

    Each fieldset is a mapping whose ``'lines'`` entry is an iterable
    of lines, each line being an iterable of labels.
    """
    for fieldset in fieldsets:
        for line in fieldset['lines']:
            for label in line:
                yield label


def get_names_labels(fieldsets):
    """Yield every ``(name, label)`` pair of every fieldset, in order.

    Each fieldset is a mapping whose ``'lines'`` entry is an iterable
    of lines, each line being an iterable of two-item pairs.
    """
    for fieldset in fieldsets:
        for line in fieldset['lines']:
            for name, label in line:
                yield name, label
|||
|
|||
|
|||
def print_fieldsets(fieldsets):
    """Print each fieldset's legend followed by its lines of labels.

    Every line is printed on its own row, prefixed by a space, with
    its labels joined by ``', '``.
    """
    for fieldset in fieldsets:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(fieldset['legend'])
        for line in fieldset['lines']:
            print(' ' + ', '.join(line))


def print_title_and_fieldsets(model):
    """Print the title and fieldsets extracted from the form of *model*.

    The title is printed as a ``#### title ####`` banner followed by a
    blank line.  Then, for each fieldset, its legend is printed,
    followed by one row per line showing its ``name : label`` pairs
    joined by ``' | '``.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    print('#### %s ####\n' % title)
    for fieldset in fieldsets:
        print(fieldset['legend'])
        for line in fieldset['lines']:
            print(' ' + ' | '.join('%s : %s' % (name, label)
                                   for name, label in line))
|||
|
|||
|
|||
def similar(a, b, threshold=0.6):
    """Return True when sequences *a* and *b* are similar enough.

    Similarity is ``difflib.SequenceMatcher(None, a, b).ratio()``; the
    result is True only when that ratio is strictly greater than
    *threshold* (0.6 by default).

    Note that the return value is a boolean, not the ratio itself.
    """
    return SequenceMatcher(None, a, b).ratio() > threshold
|||
def extract_verbose_names(model):
    """Match the fields of *model* to the labels of its legacy form.

    The form's ``(name, label)`` pairs come from
    ``extract_title_and_fieldsets``.  Each model field other than
    ``id`` is translated to its old name through
    ``field_mappings[model]`` and matched to the label registered
    under that old name, if there is a non-empty one.

    Returns ``(title, matches, non_matched)`` where ``matches`` maps
    field names to labels and ``non_matched`` is the pair
    ``(field names left without a label, old-name-to-label entries
    that no field consumed)``.

    A field name missing from ``field_mappings[model]`` raises the
    mapping's lookup error (``KeyError`` if it is a dict -- confirm).
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    names_to_labels = dict(get_names_labels(fieldsets))

    field_names = [f.name for f in model._meta.fields if f.name != 'id']
    field_names_to_old = field_mappings[model]

    matches = {}
    for name in field_names:
        old_name = field_names_to_old[name]
        label = names_to_labels.get(old_name)
        if label:
            matches[name] = label
            # Consume the label so that whatever remains in
            # names_to_labels is exactly the unmatched form entries.
            del names_to_labels[old_name]

    unmatched_fields = [name for name in field_names if name not in matches]
    non_matched = unmatched_fields, names_to_labels
    return title, matches, non_matched
|||
|
Loading…
Reference in new issue