Extract verbose names from original forms html

10 years ago · 63a23304f2
1 changed file with 47 additions and 52 deletions
--- a/legacy/scripts/extract_labels.py
+++ b/legacy/scripts/extract_labels.py
@@ -1,81 +1,76 @@
 import os
 import string
 from difflib import SequenceMatcher
 from itertools import chain
 from bs4 import BeautifulSoup
 from django.template.defaultfilters import slugify
 from materia.models import MateriaLegislativa
 from bs4.element import NavigableString, Tag
 from field_mappings import field_mappings
-def _label_from_td(td):
+def _read_line(tr):
-    return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
+    for td in tr.find_all('td'):
        label = td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
        names = [c.attrs['name'] for c in td.children if isinstance(c, Tag) and 'name' in c.attrs]
        if names:
            name = names[0].split('_', 1)[-1]
            yield name, label
-# TODO: improve, getting ids inputs
+def extract_title_and_fieldsets(model):
-# TODO: improve, getting fieldsets
+    filename = os.path.join(os.path.dirname(__file__),
-def get_fieldsets(filename):
+                            'original_forms/%s.html' % model.__name__)
    """Extract labels from a file containing the html source of a rendered
    legacy sapl form
    """
    with open(filename, 'r') as f:
        cont = f.read()
    html_doc = cont.decode('utf-8')
    soup = BeautifulSoup(html_doc, 'html.parser')
    forms = soup.find_all('form')
    [form] = [f for f in forms if (u'method', u'post') in f.attrs.items()]
    # children are either tags or strings...
    assert set(type(c) for c in form.children) == {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))
-    for fieldset in form.find_all('fieldset'):
+    title = soup.find('h1', {'class': 'firstHeading'})
-        legend = fieldset.find('legend').text
+    title = title.text if title else None
-        yield dict(
+    fieldsets = [dict(
-            legend=legend,
+        legend=fieldset.find('legend').text,
-            lines=[[_label_from_td(td) for td in tr.find_all('td')]
+        lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])
-                   for tr in fieldset.find_all('tr')]
+        for fieldset in form.find_all('fieldset')]
        )
    return title, fieldsets
-def get_labels(fieldsets):
+
 def get_names_labels(fieldsets):
    for fieldset in fieldsets:
        for line in fieldset['lines']:
-            for label in line:
+            for name, label in line:
-                yield label
+                yield name, label
-def print_fieldsets(fieldsets):
+def print_title_and_fieldsets(model):
    title, fieldsets = extract_title_and_fieldsets(model)
    print '#### %s ####\n' % title
    for fieldset in fieldsets:
        print fieldset['legend']
        for line in fieldset['lines']:
-            print '  ' + ', '.join(line)
+            print '  ' + ' | '.join('%s : %s' % (id, label) for id, label in line)
-def similar(a, b):
+def extract_verbose_names(model):
-    return SequenceMatcher(None, a, b).ratio() > 0.6
+    title, fieldsets = extract_title_and_fieldsets(model)
    names_to_labels = dict(get_names_labels(fieldsets))
 model = MateriaLegislativa
 filename = os.path.join(os.path.dirname(__file__),
                        'original_forms/%s.html' % model.__name__)
 fieldsets = list(get_fieldsets(filename))
 labels = get_labels(fieldsets)
 slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
    field_names = [f.name for f in model._meta.fields if f.name != 'id']
    matches = {}
-
+    field_names_to_old = field_mappings[model]
-while field_names:
+    for name in field_names:
-    percent, field, slug, label = sorted(
+        old_name = field_names_to_old[name]
-        [(similar(a, slug), a, slug, label)
+        label = names_to_labels.get(old_name, None)
-         for a in field_names
+        if label:
-         for (slug, label) in slugs_to_labels])[-1]
+            matches[name] = label
-    if percent > 0.6:
+            del names_to_labels[old_name]
-        matches[field] = (label, percent)
+    for name, label in matches.items():
-        slugs_to_labels.remove((slug, label))
+        field_names.remove(name)
-    else:
+    non_matched = field_names, names_to_labels
-        print 'Label not found for [%s]' % field
+    return title, matches, non_matched
    field_names.remove(field)