# sapl/legacy/scripts/scrap_original_forms.py

# -*- coding: utf-8 -*-
import os
import re
import string

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

from field_renames import field_renames
from migration import appconfs
from utils import listify, getsourcelines


assert appconfs  # to prevent removal by automatic organize imports on this file


def _read_line(tr):
    for td in tr.find_all('td'):
        label = td.text.strip().split('\n')[0].strip('\xa0' + string.whitespace)
        if label.endswith('(*)'):
            label = label[:-3].strip()
        names = [c.attrs['name'] for c in td.findAll() if isinstance(c, Tag) and 'name' in c.attrs]
        if names:
            name = names[0].split('_', 1)[-1]
            yield name, label


def extract_title_and_fieldsets(model):
    """Scrape the saved original form page of ``model``.

    The page is read from ``original_forms/<ModelName>.html`` in the
    directory of this script.

    Returns:
        A ``(title, fieldsets)`` tuple. ``title`` is the stripped text of the
        ``h1`` element with class ``firstHeading``, or ``None`` when there is
        no such element. ``fieldsets`` is a list with one dict per
        ``fieldset`` of the form, holding its ``legend`` text (``''`` when it
        has none) and its ``lines``: one list of ``(name, label)`` pairs per
        table row. ``(None, [])`` is returned when the file cannot be read.

    Raises:
        ValueError: if the page does not contain exactly one form whose
            ``method`` attribute is ``post``.
        AssertionError: if the direct children of that form are not a mix of
            tags and blank strings.
    """
    filename = os.path.join(os.path.dirname(__file__),
                            'original_forms/%s.html' % model.__name__)
    try:
        # Decode while reading: a text-mode ``read()`` already returns ``str``
        # in Python 3, and ``str`` has no ``decode`` method.
        with open(filename, 'r', encoding='utf-8') as f:
            html_doc = f.read()
    except IOError:
        return None, []

    soup = BeautifulSoup(html_doc, 'html.parser')
    # Exactly one POST form is expected; the unpacking enforces it.
    [form] = [f for f in soup.find_all('form')
              if f.attrs.get('method') == 'post']
    # children are either tags or strings...
    assert set(type(c) for c in form.children) == {Tag, NavigableString}
    # ... and all strings are empty
    assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))

    title = soup.find('h1', {'class': 'firstHeading'})
    title = title.text.strip() if title else None

    fieldsets = []
    for fieldset in form.find_all('fieldset'):
        # Look the legend up once instead of once for the test and once for the text.
        legend = fieldset.find('legend')
        fieldsets.append(dict(
            legend=legend.text if legend else '',
            lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')]))

    return title, fieldsets


def get_names_labels(fieldsets):
    """Flatten ``fieldsets`` into a stream of ``(name, label)`` tuples.

    Each fieldset is a mapping whose ``'lines'`` entry is a list of rows, and
    each row is an iterable of two-item ``name, label`` pairs. Pairs are
    produced in document order.
    """
    for group in fieldsets:
        for row in group['lines']:
            yield from ((name, label) for name, label in row)


def print_title_and_fieldsets(model):
    """Print what was scraped for ``model``.

    Output is a banner with the title, then each fieldset legend followed by
    one indented line per row listing its ``name : label`` pairs separated
    by ``|``.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    print('#### %s ####\n' % title)
    for fieldset in fieldsets:
        print(fieldset['legend'])
        for row in fieldset['lines']:
            cells = ['%s : %s' % (name, label) for name, label in row]
            print('  ' + ' | '.join(cells))


def extract_verbose_names(model):
    """Match the fields of ``model`` against the labels scraped from its form.

    Each field name except ``id`` is translated to its legacy name through
    ``field_renames[model]`` and looked up among the scraped labels. A
    scraped label is consumed by the first field that matches it.

    Returns:
        ``(title, labels, (unmatched_fields, unmatched_labels))`` where
        ``labels`` maps field names to their labels, ``unmatched_fields``
        lists the field names that got no label and ``unmatched_labels``
        maps the legacy names left over to their labels.
    """
    title, fieldsets = extract_title_and_fieldsets(model)
    scraped = dict(get_names_labels(fieldsets))

    candidates = [field.name for field in model._meta.fields
                  if field.name != 'id']
    legacy_names = field_renames[model]

    labels = {}
    unmatched_fields = []
    for name in candidates:
        legacy = legacy_names[name]
        label = scraped.get(legacy)
        if label:
            labels[name] = label
            # a scraped label can only be assigned once
            del scraped[legacy]
        else:
            unmatched_fields.append(name)

    return title, labels, (unmatched_fields, scraped)


def _guess_verbose_names(title, model_name):
    """Return the ``(singular, plural)`` verbose names for a model.

    With an empty ``title`` the singular form is built from ``model_name``:
    it is split before each capital letter and a word-final ``cao`` / ``ao``
    is rewritten as ``ção`` / ``ão``. A ``title`` ending in ``s`` is taken as
    the plural form, any other ``title`` as the singular form. The missing
    form is derived by swapping ``ção``/``ções`` and ``ão``/``ões`` and by
    adding or removing a final ``s`` on every word.
    """
    def add_s(name):
        return ' '.join(p if p.endswith('s') else p + 's' for p in name.split())

    def remove_s(name):
        return ' '.join(p[:-1] if p.endswith('s') else p for p in name.split())

    if not title:
        # default title from model name
        singular = ' '.join(re.findall('[A-Z][^A-Z]*', model_name))
        singular = re.sub(r'cao\b', 'ção', singular)
        singular = re.sub(r'ao\b', 'ão', singular)
        plural = add_s(singular.replace('ção', 'ções').replace('ão', 'ões'))
    elif title.endswith('s'):
        singular = remove_s(title.replace('ções', 'ção').replace('ões', 'ão'))
        plural = title
    else:
        singular = title
        plural = add_s(title.replace('ção', 'ções').replace('ão', 'ões'))
    return singular, plural


@listify
def source_with_verbose_names(model):
    """Yield the source lines of ``model`` rewritten with scraped verbose names.

    The first source line is yielded stripped of trailing whitespace. Every
    following line that looks like ``<name> = models.<Type>(<args>)`` is
    re-emitted with four spaces of indentation and, when a label was scraped
    for ``<name>`` and ``<args>`` does not mention ``verbose_name`` yet, with
    ``verbose_name=_(u'<label>')`` appended to its arguments. A trailing
    ``# comment`` on such a line is re-attached after padding, so that all
    comments line up in one column. Other lines pass through, padded to the
    same width.

    Unless the source already contains ``class Meta:``, a ``Meta`` class with
    ``verbose_name`` and ``verbose_name_plural`` is yielded last. These are
    derived from the scraped page title or, failing that, from the model name.
    """
    source = getsourcelines(model)
    title, labels, _non_matched = extract_verbose_names(model)

    # Raw strings keep the backslashes without relying on invalid string
    # escapes; both patterns are compiled once instead of once per line.
    field_regex = r' *(.+) = (models\.[^\(]*)\((.*verbose_name=_\(.*\)|.*)\)'
    commented_field = re.compile(field_regex + r' *# (.+)')
    plain_field = re.compile(field_regex)

    new_lines = []
    class_meta_already_exists = False
    for line in source[1:]:
        # Try the variant with a trailing comment first so that the comment
        # is captured separately rather than dropped.
        match = commented_field.match(line)
        if match:
            name, path, args, legacy_name = match.groups()
        else:
            match = plain_field.match(line)
            if match:
                name, path, args = match.groups()
                legacy_name = ''

        if not match:
            if 'class Meta:' in line:
                class_meta_already_exists = True
            new_lines.append((line, ''))
            continue

        if name in labels and 'verbose_name' not in args:
            parts = [args] if args.strip() else []
            parts.append("verbose_name=_(u'%s')" % labels[name])
            args = ', '.join(parts)
        new_lines.append(
            ('    %s = %s(%s)' % (name, path, args), legacy_name))

    yield source[0].rstrip()
    # ``default`` covers a source made of a single line (nothing to pad).
    cols = max((len(text) for text, _ in new_lines), default=0)
    for text, legacy_name in new_lines:
        text = text.rstrip().ljust(cols)
        if legacy_name:
            yield text + '  # ' + legacy_name
        else:
            yield text

    # class Meta
    if class_meta_already_exists:
        return

    # This particular page title is discarded, so the names are guessed from
    # the model name instead (presumably it is a generic heading shared by
    # several forms -- TODO confirm).
    if title == 'Tabelas Auxiliares':
        title = ''
    title_singular, title_plural = _guess_verbose_names(
        title if title else '', model.__name__)

    yield """
    class Meta:
        verbose_name = _(u'%s')
        verbose_name_plural = _(u'%s')""" % (title_singular, title_plural)


def print_app_with_verbose_names(app):
    """Print the models module of ``app`` rewritten with verbose names.

    A separator line comes first, then the module header: the coding line,
    followed by every source line found before the first line containing
    ``class``. Existing coding and gettext import lines are dropped, and the
    gettext import is re-inserted right after the ``django.db`` models
    import. Each model of ``app.models`` is then printed through
    ``source_with_verbose_names``, preceded by blank lines.
    """
    coding_line = '# -*- coding: utf-8 -*-'
    gettext_import = 'from django.utils.translation import ugettext as _'
    models_import = 'from django.db import models'

    print('##################################################################')
    header_parts = [coding_line + '\n']
    for line in getsourcelines(app.models_module):
        if line == coding_line or line == gettext_import:
            # re-emitted in a fixed position, so skip the original
            continue
        if line == models_import:
            header_parts.append(models_import + '\n' + gettext_import + '\n')
        elif 'class' in line:
            # the header ends where the first class starts
            break
        else:
            header_parts.append(line + '\n')
    print(''.join(header_parts).strip())

    for model in app.models.values():
        print('\n')
        for source_line in source_with_verbose_names(model):
            print(source_line)


def list_models_with_no_scrapped_data(app):
    """Print the name of every model of ``app`` that has no scraped data.

    A model counts as having no scraped data when ``extract_verbose_names``
    gives it neither a title nor any label.
    """
    for model in app.models.values():
        title, labels, _unmatched = extract_verbose_names(model)
        if title or labels:
            continue
        print(model.__name__)
Add model verbose name guessing by model name 10 years ago			`# -- coding: utf-8 --`
Start label matching from original forms html 10 years ago			`import os`
Add source rewrite with scrapped verbose names 10 years ago			`import re`
Add script to extract labels from sapl forms 10 years ago			`import string`

Start label matching from original forms html 10 years ago			`from bs4 import BeautifulSoup`
Extract fieldsets from original forms html 10 years ago			`from bs4.element import NavigableString, Tag`
Start label matching from original forms html 10 years ago
Rename field_mappings -> field_renames 10 years ago			`from field_renames import field_renames`
Fix field id detection in scraping 10 years ago			`from migration import appconfs`
Add method to get source lines in unicode 10 years ago			`from utils import listify, getsourcelines`
Extract verbose names from original forms html 10 years ago
Add script to extract labels from sapl forms 10 years ago
Fix field id detection in scraping 10 years ago			`assert appconfs # to prevent removal by automatic organize imports on this file`


Extract verbose names from original forms html 10 years ago			`def _read_line(tr):`
			`for td in tr.find_all('td'):`
Port strings to python 3 10 years ago			`label = td.text.strip().split('\n')[0].strip('\xa0' + string.whitespace)`
Rewrite app models module with verbose names 10 years ago			`if label.endswith('(*)'):`
			`label = label[:-3].strip()`
Fix field id detection in scraping 10 years ago			`names = [c.attrs['name'] for c in td.findAll() if isinstance(c, Tag) and 'name' in c.attrs]`
Extract verbose names from original forms html 10 years ago			`if names:`
			`name = names[0].split('_', 1)[-1]`
			`yield name, label`
Add script to extract labels from sapl forms 10 years ago

Extract verbose names from original forms html 10 years ago			`def extract_title_and_fieldsets(model):`
			`filename = os.path.join(os.path.dirname(__file__),`
			`'original_forms/%s.html' % model.__name__)`
Adjust scraping code a little 10 years ago			`try:`
			`with open(filename, 'r') as f:`
			`cont = f.read()`
			`except IOError:`
			`return None, []`

Add script to extract labels from sapl forms 10 years ago			`html_doc = cont.decode('utf-8')`
			`soup = BeautifulSoup(html_doc, 'html.parser')`
			`forms = soup.find_all('form')`
Port strings to python 3 10 years ago			`[form] = [f for f in forms if ('method', 'post') in f.attrs.items()]`
Extract fieldsets from original forms html 10 years ago			`# children are either tags or strings...`
			`assert set(type(c) for c in form.children) == {Tag, NavigableString}`
			`# ... and all strings are empty`
			`assert all(not c.strip() for c in form.children if isinstance(c, NavigableString))`

Extract verbose names from original forms html 10 years ago			`title = soup.find('h1', {'class': 'firstHeading'})`
Add source rewrite with scrapped verbose names 10 years ago			`title = title.text.strip() if title else None`
Extract verbose names from original forms html 10 years ago			`fieldsets = [dict(`
Adjust scraping code a little 10 years ago			`legend=fieldset.find('legend').text if fieldset.find('legend') else '',`
Extract verbose names from original forms html 10 years ago			`lines=[list(_read_line(tr)) for tr in fieldset.find_all('tr')])`
			`for fieldset in form.find_all('fieldset')]`

			`return title, fieldsets`
Extract fieldsets from original forms html 10 years ago

Extract verbose names from original forms html 10 years ago			`def get_names_labels(fieldsets):`
Extract fieldsets from original forms html 10 years ago			`for fieldset in fieldsets:`
			`for line in fieldset['lines']:`
Extract verbose names from original forms html 10 years ago			`for name, label in line:`
			`yield name, label`
Extract fieldsets from original forms html 10 years ago

Extract verbose names from original forms html 10 years ago			`def print_title_and_fieldsets(model):`
			`title, fieldsets = extract_title_and_fieldsets(model)`
Port print statements to python 3 10 years ago			`print('#### %s ####\n' % title)`
Extract fieldsets from original forms html 10 years ago			`for fieldset in fieldsets:`
Port print statements to python 3 10 years ago			`print(fieldset['legend'])`
Extract fieldsets from original forms html 10 years ago			`for line in fieldset['lines']:`
Port print statements to python 3 10 years ago			`print(' ' + ' \| '.join('%s : %s' % (id, label) for id, label in line))`
Extract verbose names from original forms html 10 years ago

			`def extract_verbose_names(model):`
			`title, fieldsets = extract_title_and_fieldsets(model)`
			`names_to_labels = dict(get_names_labels(fieldsets))`

			`field_names = [f.name for f in model._meta.fields if f.name != 'id']`

Add source rewrite with scrapped verbose names 10 years ago			`labels = {}`
Rename field_mappings -> field_renames 10 years ago			`field_names_to_old = field_renames[model]`
Extract verbose names from original forms html 10 years ago			`for name in field_names:`
			`old_name = field_names_to_old[name]`
			`label = names_to_labels.get(old_name, None)`
			`if label:`
Add source rewrite with scrapped verbose names 10 years ago			`labels[name] = label`
Extract verbose names from original forms html 10 years ago			`del names_to_labels[old_name]`
Add source rewrite with scrapped verbose names 10 years ago			`for name, label in labels.items():`
Extract verbose names from original forms html 10 years ago			`field_names.remove(name)`
			`non_matched = field_names, names_to_labels`
Add source rewrite with scrapped verbose names 10 years ago			`return title, labels, non_matched`


			`@listify`
			`def source_with_verbose_names(model):`
Add method to get source lines in unicode 10 years ago			`source = getsourcelines(model)`
Add source rewrite with scrapped verbose names 10 years ago			`title, labels, non_matched = extract_verbose_names(model)`

Ignore already set verbose_name on source rewrite 10 years ago			`field_regex = ' (.+) = (models\.[^\(])\((.verbose_name=_\(.\)\|.*)\)'`
Add source rewrite with scrapped verbose names 10 years ago			`new_lines = []`
Test for previous class Meta on source generation 10 years ago			`class_meta_already_exists = False`
Add source rewrite with scrapped verbose names 10 years ago			`for line in source[1:]:`
			`for regex, split in [`
			`(field_regex + ' *# (.+)', lambda groups: groups),`
Fix field matching regex for source rewrite 10 years ago			`(field_regex, lambda groups: groups + ('',))]:`
Add source rewrite with scrapped verbose names 10 years ago			`match = re.match(regex, line)`
			`if match:`
			`name, path, args, legacy_name = split(match.groups())`
Rewrite app models module with verbose names 10 years ago			`if name in labels and 'verbose_name' not in args:`
Add source rewrite with scrapped verbose names 10 years ago			`args = [args] if args.strip() else []`
Port strings to python 3 10 years ago			`args.append("verbose_name=_(u'%s')" % labels[name])`
			`args = ', '.join(args)`
Add source rewrite with scrapped verbose names 10 years ago			`new_lines.append(`
Port strings to python 3 10 years ago			`(' %s = %s(%s)' % (name, path, args), legacy_name))`
Add source rewrite with scrapped verbose names 10 years ago			`break`
			`else:`
Test for previous class Meta on source generation 10 years ago			`if 'class Meta:' in line:`
			`class_meta_already_exists = True`
Add source rewrite with scrapped verbose names 10 years ago			`new_lines.append((line, ''))`
			`yield source[0].rstrip()`
			`cols = max(map(len, [line for line, _ in new_lines]))`
			`for line, legacy_name in new_lines:`
			`line = line.rstrip().ljust(cols)`
			`if legacy_name:`
Add double space before comments in source rewrite 10 years ago			`yield line + ' # ' + legacy_name`
Add source rewrite with scrapped verbose names 10 years ago			`else:`
			`yield line`
Test for previous class Meta on source generation 10 years ago
			`# class Meta`
			`if class_meta_already_exists:`
			`return`

Add model verbose name guessing by model name 10 years ago			`if title == 'Tabelas Auxiliares':`
			`title = ''`
Test for previous class Meta on source generation 10 years ago			`title = title if title else ''`
Add model verbose name guessing by model name 10 years ago
			`def add_s(name):`
			`return ' '.join(p if p.endswith('s') else p + 's' for p in name.split())`

			`def remove_s(name):`
			`return ' '.join(p[:-1] if p.endswith('s') else p for p in name.split())`

			`if not title:`
			`# default title from model name`
Port strings to python 3 10 years ago			`title_singular = ' '.join(re.findall('[A-Z][^A-Z]*', model.__name__))`
			`title_singular = re.sub('cao\\b', 'ção', title_singular)`
			`title_singular = re.sub('ao\\b', 'ão', title_singular)`
			`title_plural = add_s(title_singular.replace('ção', 'ções').replace('ão', 'ões'))`
Add model verbose name guessing by model name 10 years ago
			`elif title.endswith('s'):`
Port strings to python 3 10 years ago			`title_singular = remove_s(title.replace('ções', 'ção').replace('ões', 'ão'))`
Add model verbose name guessing by model name 10 years ago			`title_plural = title`
Test for previous class Meta on source generation 10 years ago			`else:`
Add model verbose name guessing by model name 10 years ago			`title_singular = title`
Port strings to python 3 10 years ago			`title_plural = add_s(title.replace('ção', 'ções').replace('ão', 'ões'))`
Add model verbose name guessing by model name 10 years ago
Test for previous class Meta on source generation 10 years ago			`yield """`
Add source rewrite with scrapped verbose names 10 years ago			`class Meta:`
			`verbose_name = _(u'%s')`
Rewrite app models module with verbose names 10 years ago			`verbose_name_plural = _(u'%s')""" % (title_singular, title_plural)`


			`def print_app_with_verbose_names(app):`
Port print statements to python 3 10 years ago			`print('##################################################################')`
Rewrite app models module with verbose names 10 years ago			`header = '# -- coding: utf-8 --\n'`
Add method to get source lines in unicode 10 years ago			`for line in getsourcelines(app.models_module):`
			`if line in ['# -- coding: utf-8 --',`
			`'from django.utils.translation import ugettext as _', ]:`
Rewrite app models module with verbose names 10 years ago			`continue`
Add method to get source lines in unicode 10 years ago			`elif line == 'from django.db import models':`
Rewrite app models module with verbose names 10 years ago			`header += '''from django.db import models`
			`from django.utils.translation import ugettext as _`
			`'''`
			`elif 'class' in line:`
			`break`
			`else:`
Adjust imports rewrite 10 years ago			`header += line + '\n'`
Port print statements to python 3 10 years ago			`print(header.strip())`
Rewrite app models module with verbose names 10 years ago			`for model in app.models.values():`
Port print statements to python 3 10 years ago			`print('\n')`
Rewrite app models module with verbose names 10 years ago			`for p in source_with_verbose_names(model):`
Port print statements to python 3 10 years ago			`print(p)`
List models without scrapped data 10 years ago

			`def list_models_with_no_scrapped_data(app):`
			`for model in app.models.values():`
			`if not any(extract_verbose_names(model)[:2]):`
Port print statements to python 3 10 years ago			`print(model.__name__)`