# Reconstructed from the patch "Extract verbose names from original forms
# html" (legacy/scripts/extract_labels.py).
"""Extract field names and labels from saved HTML of the original forms."""
import io
import os
import string

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

from field_mappings import field_mappings


def _read_line(tr):
    """Yield ``(name, label)`` for each cell of ``tr`` holding a named tag.

    The label is the first line of the cell text, stripped of whitespace
    and non-breaking spaces.  The name is the ``name`` attribute of the
    first direct child tag that has one, with everything up to and
    including the first underscore removed.  Cells without any named
    direct child are skipped.
    """
    for td in tr.find_all('td'):
        first_text_line = td.text.strip().split('\n')[0]
        label = first_text_line.strip(u'\xa0' + string.whitespace)
        names = [child.attrs['name'] for child in td.children
                 if isinstance(child, Tag) and 'name' in child.attrs]
        if names:
            # Drop the prefix before the first underscore, if there is one.
            name = names[0].split('_', 1)[-1]
            yield name, label


def extract_title_and_fieldsets(model):
    """Parse ``original_forms/<model name>.html`` located next to this file.

    Returns a ``(title, fieldsets)`` tuple.  ``title`` is the text of the
    ``h1.firstHeading`` element, or None when there is none.  ``fieldsets``
    is a list of dicts with keys ``legend`` (legend text, or None when the
    fieldset has no legend) and ``lines`` (one list of ``(name, label)``
    pairs per table row of the fieldset).

    Raises ValueError unless exactly one form has ``method="post"``, and
    AssertionError when that form has unexpected direct children.
    """
    filename = os.path.join(os.path.dirname(__file__),
                            'original_forms/%s.html' % model.__name__)
    # io.open decodes while reading and behaves the same on Python 2 and 3.
    with io.open(filename, encoding='utf-8') as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    forms = soup.find_all('form')
    [form] = [f for f in forms if (u'method', u'post') in f.attrs.items()]

    # children are either tags or strings...
    assert {type(c) for c in form.children} == {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip() for c in form.children
               if isinstance(c, NavigableString))

    title = soup.find('h1', {'class': 'firstHeading'})
    title = title.text if title else None

    fieldsets = []
    for fieldset in form.find_all('fieldset'):
        legend = fieldset.find('legend')
        fieldsets.append(dict(
            # Tolerate a missing legend the same way as a missing title.
            legend=legend.text if legend else None,
            lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')],
        ))

    return title, fieldsets


def get_names_labels(fieldsets):
    """Yield every ``(name, label)`` pair found in ``fieldsets``, in order."""
    for fieldset in fieldsets:
        for line in fieldset['lines']:
            for name, label in line:
                yield name, label


def print_title_and_fieldsets(model):
    """Print the title, legends and ``name : label`` pairs of a model form."""
    title, fieldsets = extract_title_and_fieldsets(model)
    print('#### %s ####\n' % title)
    for fieldset in fieldsets:
        print(fieldset['legend'])
        for line in fieldset['lines']:
            print(' ' + ' | '.join('%s : %s' % (name, label)
                                   for name, label in line))


def extract_verbose_names(model):
    """Match the fields of ``model`` to the labels of its original form.

    Each model field other than ``id`` is translated to its old name via
    ``field_mappings[model]`` and looked up among the names extracted from
    the form.  A label is consumed by the first field that matches it.

    Returns ``(title, matches, non_matched)`` where ``matches`` maps field
    names to labels and ``non_matched`` is the pair
    ``(unmatched field names, unmatched old-name-to-label dict)``.

    Raises KeyError when a field has no entry in the mapping.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    names_to_labels = dict(get_names_labels(fieldsets))
    field_names = [f.name for f in model._meta.fields if f.name != 'id']
    field_names_to_old = field_mappings[model]

    matches = {}
    for name in field_names:
        old_name = field_names_to_old[name]
        label = names_to_labels.get(old_name)
        if label:
            matches[name] = label
            # Consume the label so the leftovers are exactly the unmatched.
            del names_to_labels[old_name]

    unmatched_fields = [name for name in field_names if name not in matches]
    non_matched = unmatched_fields, names_to_labels
    return title, matches, non_matched