From 63a23304f29bba899534b649277cfaa80b02c12e Mon Sep 17 00:00:00 2001
From: Marcio Mazza <marciomazza@gmail.com>
Date: Sun, 21 Jun 2015 21:18:25 -0300
Subject: [PATCH] Extract verbose names from original forms html

---
 legacy/scripts/extract_labels.py | 99 +++++++++++++++-----------------
 1 file changed, 47 insertions(+), 52 deletions(-)

diff --git a/legacy/scripts/extract_labels.py b/legacy/scripts/extract_labels.py
index a82c060e3..60bb18181 100644
--- a/legacy/scripts/extract_labels.py
+++ b/legacy/scripts/extract_labels.py
@@ -1,81 +1,76 @@
 import os
 import string
-from difflib import SequenceMatcher
-from itertools import chain
 
 from bs4 import BeautifulSoup
-from django.template.defaultfilters import slugify
-
-from materia.models import MateriaLegislativa
 from bs4.element import NavigableString, Tag
 
+from field_mappings import field_mappings
+
 
-def _label_from_td(td):
-    return td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
+def _read_line(tr):
+    """Yield (name, label) for each td of `tr` that has a named child tag."""
+    for td in tr.find_all('td'):
+        label = td.text.strip().split('\n')[0].strip(u'\xa0' + string.whitespace)
+        for tag in (c for c in td.children if isinstance(c, Tag) and 'name' in c.attrs):
+            yield tag.attrs['name'].split('_', 1)[-1], label
+            break  # only the first named child of the td is used
 
 
-# TODO: improve, getting ids inputs
-# TODO: improve, getting fieldsets
-def get_fieldsets(filename):
-    """Extract labels from a file containg the html source of a rendered
-    legacy sapl form
-    """
+def extract_title_and_fieldsets(model):
+    """Parse original_forms/<model name>.html into (title, fieldsets)."""
+    filename = os.path.join(os.path.dirname(__file__),
+                            'original_forms/%s.html' % model.__name__)
     with open(filename, 'r') as f:
         cont = f.read()
     html_doc = cont.decode('utf-8')
     soup = BeautifulSoup(html_doc, 'html.parser')
     forms = soup.find_all('form')
     [form] = [f for f in forms if (u'method', u'post') in f.attrs.items()]
-
     # children are either tags or strings...
     assert set(type(c) for c in form.children) == {Tag, NavigableString}
     # ... and all strings are empty
     assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))
 
-    for fieldset in form.find_all('fieldset'):
-        legend = fieldset.find('legend').text
-        yield dict(
-            legend=legend,
-            lines=[[_label_from_td(td) for td in tr.find_all('td')]
-                   for tr in fieldset.find_all('tr')]
-        )
+    heading = soup.find('h1', {'class': 'firstHeading'})
+    # each fieldset becomes its legend plus one (name, label) list per row
+    fieldsets = [{'legend': fs.find('legend').text,
+                  'lines': [list(_read_line(tr)) for tr in fs.find_all('tr')]}
+                 for fs in form.find_all('fieldset')]
+
+    return (heading.text if heading else None), fieldsets
 
 
-def get_labels(fieldsets):
+def get_names_labels(fieldsets):  # yields every (name, label) pair
     for fieldset in fieldsets:
         for line in fieldset['lines']:
-            for label in line:
-                yield label
+            for name, label in line:
+                yield name, label
 
 
-def print_fieldsets(fieldsets):
+def print_title_and_fieldsets(model):  # print title, legends and name/label rows
+    title, fieldsets = extract_title_and_fieldsets(model)
+    print '#### %s ####\n' % title
     for fieldset in fieldsets:
         print fieldset['legend']
         for line in fieldset['lines']:
-            print '  ' + ', '.join(line)
-
-
-def similar(a, b):
-    return SequenceMatcher(None, a, b).ratio() > 0.6
-
-model = MateriaLegislativa
-filename = os.path.join(os.path.dirname(__file__),
-                        'original_forms/%s.html' % model.__name__)
-fieldsets = list(get_fieldsets(filename))
-labels = get_labels(fieldsets)
-slugs_to_labels = [(slugify(s.lower()).replace('-', '_'), s) for s in labels]
-field_names = [f.name for f in model._meta.fields if f.name != 'id']
-
-matches = {}
-
-while field_names:
-    percent, field, slug, label = sorted(
-        [(similar(a, slug), a, slug, label)
-         for a in field_names
-         for (slug, label) in slugs_to_labels])[-1]
-    if percent > 0.6:
-        matches[field] = (label, percent)
-        slugs_to_labels.remove((slug, label))
-    else:
-        print 'Label not found for [%s]' % field
-    field_names.remove(field)
+            print '  ' + ' | '.join('%s : %s' % (name, label) for name, label in line)
+
+
+def extract_verbose_names(model):
+    """Pair each field of `model` (except 'id') with a label, via field_mappings.
+
+    Returns (title, matches, non_matched): matches maps field names to labels,
+    non_matched is (field names without a label, unused old-name/label dict).
+    """
+    title, fieldsets = extract_title_and_fieldsets(model)
+    names_to_labels = dict(get_names_labels(fieldsets))
+    field_names = [f.name for f in model._meta.fields if f.name != 'id']
+    field_names_to_old = field_mappings[model]
+    matches = {}
+    for name in field_names:
+        old_name = field_names_to_old[name]
+        # an empty label counts as missing and stays in names_to_labels
+        if names_to_labels.get(old_name):
+            matches[name] = names_to_labels.pop(old_name)
+    non_matched = [n for n in field_names if n not in matches], names_to_labels
+    return title, matches, non_matched