Sistema de Apoio ao Processo Legislativo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

81 lines
2.5 KiB

import os
import string
from difflib import SequenceMatcher
from itertools import chain
from bs4 import BeautifulSoup
from django.template.defaultfilters import slugify
from materia.models import MateriaLegislativa
from bs4.element import NavigableString, Tag
def _label_from_td(td):
return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
# TODO: improve, getting ids inputs
# TODO: improve, getting fieldsets
def get_fieldsets(filename):
    """Extract labels from a file containing the html source of a rendered
    legacy sapl form.

    The html must hold exactly one form whose ``method`` attribute is
    ``post``; the direct children of that form must be tags or blank
    strings.

    Yields one dict per ``<fieldset>`` inside that form:
        legend: text of the fieldset's ``<legend>`` (empty if it has none)
        lines: one list per ``<tr>``, with the label of each of its ``<td>``

    Raises ValueError if the number of post forms is not exactly one and
    AssertionError if the form's direct children break the rules above.
    """
    # Read bytes and decode once: calling ``decode`` on the result of a
    # text-mode ``read()`` only works on Python 2.
    with open(filename, 'rb') as source:
        html_doc = source.read().decode('utf-8')
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Unpacking into a one-element list enforces "exactly one post form".
    [form] = [candidate for candidate in soup.find_all('form')
              if candidate.get('method') == u'post']
    # children are either tags or strings...
    # (subset test: a form without any whitespace between its tags has
    # no string children at all and is still acceptable)
    assert set(type(c) for c in form.children) <= {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip()
               for c in form.children if isinstance(c, NavigableString))
    for fieldset in form.find_all('fieldset'):
        legend_tag = fieldset.find('legend')
        # A fieldset without <legend> yields an empty legend rather than
        # failing on ``None.text``.
        legend = legend_tag.text if legend_tag is not None else u''
        yield dict(
            legend=legend,
            lines=[[_label_from_td(td) for td in tr.find_all('td')]
                   for tr in fieldset.find_all('tr')]
        )
def get_labels(fieldsets):
    """Yield, in order, every label of every line of the given fieldsets.

    fieldsets: iterable of dicts whose 'lines' entry is an iterable of
    lines, each line being an iterable of labels.
    """
    for entry in fieldsets:
        for text in chain.from_iterable(entry['lines']):
            yield text
def print_fieldsets(fieldsets):
    """Print each fieldset's legend followed by its lines of labels.

    fieldsets: iterable of dicts with a 'legend' string and a 'lines'
    list of lists of label strings.

    Each line goes on its own row, prefixed by one space, with its
    labels joined by ', '.
    """
    # A parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function.
    for fieldset in fieldsets:
        print(fieldset['legend'])
        for line in fieldset['lines']:
            print(' ' + ', '.join(line))
def similar(a, b, threshold=0.6):
    """Tell whether two sequences are alike enough to count as a match.

    a, b: the sequences (here, strings) to compare
    threshold: similarity ratio that must be exceeded; defaults to 0.6

    Returns True when the SequenceMatcher ratio of a and b is strictly
    greater than threshold, False otherwise.
    """
    return SequenceMatcher(None, a, b).ratio() > threshold
# Match each field of the model to the label of the legacy form whose
# slug looks most like the field name.
model = MateriaLegislativa
filename = os.path.join(os.path.dirname(__file__),
                        'original_forms/%s.html' % model.__name__)
fieldsets = list(get_fieldsets(filename))
labels = get_labels(fieldsets)
# (slug, label) pairs; slugs use '_' so they resemble field names
slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
field_names = [f.name for f in model._meta.fields if f.name != 'id']

# field name -> (label, similarity ratio)
matches = {}
while field_names:
    # Rank every remaining (field, label) pair by its real similarity
    # ratio, so that the best pair overall is settled first.
    candidates = [
        (SequenceMatcher(None, name, slug).ratio(), name, slug, label)
        for name in field_names
        for (slug, label) in slugs_to_labels
    ]
    if candidates:
        percent, field, slug, label = max(candidates)
    else:
        # Every label has already been taken: the remaining fields
        # cannot be matched.
        percent, field, slug, label = 0.0, field_names[0], None, None
    if percent > 0.6:
        matches[field] = (label, percent)
        # a label can be assigned to one field only
        slugs_to_labels.remove((slug, label))
    else:
        print('Label not found for [%s]' % field)
    field_names.remove(field)