Sistema de Apoio ao Processo Legislativo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

60 lines
1.8 KiB

import os
import string
from difflib import SequenceMatcher
from itertools import chain
from bs4 import BeautifulSoup
from django.template.defaultfilters import slugify
from materia.models import MateriaLegislativa
def _label_from_td(td):
return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
# TODO: improve by also extracting the ids of the inputs
# TODO: improve by also extracting the fieldsets
def get_labels(filename, flat=True):
    """Extract labels from a file containing the HTML source of a rendered
    legacy sapl form.

    The file is read as UTF-8 and parsed with BeautifulSoup. The single
    ``<form>`` whose ``method`` attribute is exactly ``post`` is located,
    and one label is taken from every ``<td>`` of every ``<tr>`` inside it.
    Each row of labels is also printed, formatted as a comma-separated list
    of ``u'...'`` literals.

    Args:
        filename: path of the HTML file to read.
        flat: when true (the default) return a single flat list of labels;
            otherwise return a list of rows, each a list of labels.

    Returns:
        A list of labels, or a list of lists of labels when ``flat`` is
        false.

    Raises:
        ValueError: if the document does not contain exactly one form
            with ``method="post"``.
    """
    # Local import keeps this block self-contained. io.open decodes at the
    # I/O boundary and behaves the same on Python 2 and Python 3, unlike
    # reading in text mode and calling .decode() afterwards.
    import io

    with io.open(filename, 'r', encoding='utf-8') as f:
        html_doc = f.read()

    soup = BeautifulSoup(html_doc, 'html.parser')
    post_forms = [candidate for candidate in soup.find_all('form')
                  if candidate.attrs.get(u'method') == u'post']
    if len(post_forms) != 1:
        # Same exception type the tuple-unpacking used to raise, but with
        # a message that says what went wrong.
        raise ValueError(
            'expected exactly one form with method="post" in %s, found %d'
            % (filename, len(post_forms)))
    form = post_forms[0]

    labels = [[_label_from_td(td) for td in tr.find_all('td')]
              for tr in form.find_all('tr')]

    for line in labels:
        # Single-argument parenthesised print is valid on Python 2 and 3.
        print(', '.join("u'%s'" % label for label in line))

    if flat:
        return list(chain.from_iterable(labels))
    return labels
def similar(a, b):
    """Tell whether two sequences are alike.

    Returns ``True`` when the ``difflib.SequenceMatcher`` ratio of ``a``
    and ``b`` is strictly greater than 0.6, ``False`` otherwise.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    score = matcher.ratio()
    return score > 0.6
# Match each field of the model to the most similar label scraped from the
# saved HTML of the corresponding legacy form.
model = MateriaLegislativa
filename = os.path.join(os.path.dirname(__file__),
                        'original_forms/%s.html' % model.__name__)
labels = get_labels(filename)

# Pair every label with a slug shaped like a field name
# (lower-case, underscores instead of dashes).
slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
field_names = [f.name for f in model._meta.fields if f.name != 'id']

# Maps field name -> (label, similarity ratio) for every matched field.
matches = {}
while field_names:
    if not slugs_to_labels:
        # Every label has been consumed: none of the remaining fields can
        # be matched. Report them instead of crashing on an empty
        # candidate list.
        for field in field_names:
            print('Label not found for [%s]' % field)
        del field_names[:]
        break

    # Rank candidate pairs by their actual similarity ratio so the best
    # remaining (field, label) pair is chosen, rather than an arbitrary
    # pair that merely clears the threshold.
    percent, field, slug, label = max(
        (SequenceMatcher(None, name, candidate_slug).ratio(),
         name, candidate_slug, candidate_label)
        for name in field_names
        for (candidate_slug, candidate_label) in slugs_to_labels)

    if percent > 0.6:
        matches[field] = (label, percent)
        # Each label may be assigned to one field only.
        slugs_to_labels.remove((slug, label))
    else:
        print('Label not found for [%s]' % field)
    field_names.remove(field)