mirror of https://github.com/interlegis/sapl.git
Marcio Mazza
10 years ago
committed by
Marcio Mazza
1 changed files with 47 additions and 52 deletions
@ -1,81 +1,76 @@ |
|||||
import os |
import os |
||||
import string |
import string |
||||
from difflib import SequenceMatcher |
|
||||
from itertools import chain |
|
||||
|
|
||||
from bs4 import BeautifulSoup |
from bs4 import BeautifulSoup |
||||
from django.template.defaultfilters import slugify |
|
||||
|
|
||||
from materia.models import MateriaLegislativa |
|
||||
from bs4.element import NavigableString, Tag |
from bs4.element import NavigableString, Tag |
||||
|
|
||||
|
from field_mappings import field_mappings |
||||
|
|
||||
|
|
||||
def _read_line(tr):
    """Yield a ``(name, label)`` pair for each named cell of a table row.

    For every ``<td>`` found under *tr*:

    * the label is the first line of the cell's stripped text, with
      surrounding whitespace and non-breaking spaces removed;
    * the name comes from the first direct child tag of the cell that
      carries a ``name`` attribute, with everything up to and including
      the first underscore dropped (a name without an underscore is kept
      whole).

    Cells with no named direct child tag are skipped.
    """
    for td in tr.find_all('td'):
        first_line = td.text.strip().split('\n')[0]
        label = first_line.strip(u'\xa0' + string.whitespace)
        names = [child.attrs['name'] for child in td.children
                 if isinstance(child, Tag) and 'name' in child.attrs]
        if names:
            # Keep only what follows the first underscore of the attribute.
            name = names[0].split('_', 1)[-1]
            yield name, label
||||
|
|
||||
|
|
||||
def extract_title_and_fieldsets(model):
    """Extract the title and fieldsets of a rendered legacy sapl form.

    The html source is read from ``original_forms/<ModelName>.html``,
    located relative to this module's directory.

    Returns a ``(title, fieldsets)`` tuple where:

    * ``title`` is the text of the ``<h1 class="firstHeading">`` element,
      or ``None`` when the page has no such element;
    * ``fieldsets`` is a list with one dict per ``<fieldset>`` of the form,
      holding its ``legend`` text and its ``lines`` -- one list of
      ``(name, label)`` pairs per ``<tr>``, as produced by ``_read_line``.

    Raises ``ValueError`` unless exactly one form has ``method="post"``,
    and ``AssertionError`` if the form's direct children are not just tags
    separated by blank strings.
    """
    filename = os.path.join(os.path.dirname(__file__),
                            'original_forms/%s.html' % model.__name__)
    # Read raw bytes and decode once: a text-mode read followed by
    # ``.decode`` only works on Python 2.
    with open(filename, 'rb') as html_file:
        html_doc = html_file.read().decode('utf-8')
    soup = BeautifulSoup(html_doc, 'html.parser')
    forms = soup.find_all('form')
    # Exactly one POST form is expected; unpacking fails loudly otherwise.
    [form] = [candidate for candidate in forms
              if (u'method', u'post') in candidate.attrs.items()]

    # children are either tags or strings...
    assert set(type(c) for c in form.children) == {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip() for c in form.children
               if isinstance(c, NavigableString))

    heading = soup.find('h1', {'class': 'firstHeading'})
    title = heading.text if heading is not None else None
    fieldsets = [
        dict(legend=fieldset.find('legend').text,
             lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])
        for fieldset in form.find_all('fieldset')]
    return title, fieldsets
||||
|
|
||||
|
|
||||
def get_names_labels(fieldsets):
    """Yield every ``(name, label)`` pair found in *fieldsets*, in order.

    *fieldsets* is an iterable of dicts, each with a ``'lines'`` entry
    holding a list of lines; every line is an iterable of
    ``(name, label)`` pairs.
    """
    for fieldset in fieldsets:
        for line in fieldset['lines']:
            for name, label in line:
                yield name, label
||||
|
|
||||
|
|
||||
def print_title_and_fieldsets(model):
    """Print the title and fieldsets extracted from *model*'s legacy form.

    The title is printed as a ``####`` banner followed by a blank line.
    Each fieldset then prints its legend and, below it, one row per line
    listing the ``name : label`` pairs separated by ``' | '``.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('#### %s ####\n' % title)
    for fieldset in fieldsets:
        print(fieldset['legend'])
        for line in fieldset['lines']:
            pairs = ('%s : %s' % (name, label) for name, label in line)
            print(' ' + ' | '.join(pairs))
||||
|
|
||||
|
|
||||
def extract_verbose_names(model):
    """Match *model*'s fields to the labels of its legacy form.

    Each model field except ``id`` is translated to its old name through
    ``field_mappings[model]``, and that old name is looked up among the
    ``(name, label)`` pairs extracted from the legacy form.

    Returns ``(title, matches, non_matched)`` where:

    * ``title`` is the form title (or ``None``);
    * ``matches`` maps model field names to the label that was found;
    * ``non_matched`` is a ``(field_names, names_to_labels)`` pair: the
      model fields left without a label (in model order) and the form
      names/labels that no field claimed.

    A lookup failure on ``field_mappings`` (model or field missing)
    propagates to the caller.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    names_to_labels = dict(get_names_labels(fieldsets))
    field_names_to_old = field_mappings[model]

    matches = {}
    unmatched_fields = []
    for field in model._meta.fields:
        name = field.name
        if name == 'id':
            continue
        old_name = field_names_to_old[name]
        label = names_to_labels.get(old_name)
        if label:
            matches[name] = label
            # A label can be claimed by one field only.
            del names_to_labels[old_name]
        else:
            # Missing or empty label: the field stays unmatched.
            unmatched_fields.append(name)

    non_matched = unmatched_fields, names_to_labels
    return title, matches, non_matched
|
||||
|
Loading…
Reference in new issue