From f3f22c6bbab2911024893ab883c7504e473c6cdb Mon Sep 17 00:00:00 2001
From: Edward Oliveira
Date: Sat, 18 Apr 2026 16:11:10 -0300
Subject: [PATCH] =?UTF-8?q?feat:=20add=20backfill=5Ffile=5Fmetadata=20mana?=
 =?UTF-8?q?gement=20command=20(RFC=20=C2=A78)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Creates FileMetadata rows for all existing uploaded files that have a
non-empty file field but no _metadata FK yet. Idempotent and safe to
interrupt.

Features:
  --dry-run        report without writing
  --batch-size     rows per batch (default 500)
  --rate-limit     max rows/s for NFS-constrained deployments
  --skip-hash      stat-only (skip SHA-256 for very large instances)
  --app / --model  restrict to a single model

For each qualifying row it:
  1. reads storage_name from the DB field value (relative path, unchanged)
  2. derives original_filename from the path's basename
  3. stat()s the file for file_size_bytes (fast, ~0.3 ms/file)
  4. optionally SHA-256 hashes it for content_hash (throughput-bound)
  5. creates a FileMetadata row and sets the _metadata FK

Missing files trigger a warning and are counted, but do not abort the
run. They are expected on dev DBs, and the rows created for them (with
NULL file_size_bytes) support the "broken reference detection" of RFC §7.

Co-Authored-By: Claude Sonnet 4.6
---
 .../commands/backfill_file_metadata.py | 204 ++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 sapl/base/management/commands/backfill_file_metadata.py

diff --git a/sapl/base/management/commands/backfill_file_metadata.py b/sapl/base/management/commands/backfill_file_metadata.py
new file mode 100644
index 000000000..4cc7d5d62
--- /dev/null
+++ b/sapl/base/management/commands/backfill_file_metadata.py
@@ -0,0 +1,204 @@
+"""
+Backfill FileMetadata rows for all existing uploaded files.
+
+Run once after deploying MetadataFileField to production. Safe to interrupt
+and re-run: processes only rows where the _metadata FK is NULL and the file
+field is non-empty.
+
+Usage:
+    python manage.py backfill_file_metadata
+    python manage.py backfill_file_metadata --batch-size=200 --rate-limit=20
+    python manage.py backfill_file_metadata --dry-run
+    python manage.py backfill_file_metadata --app materia --model materialegislativa
+"""
+import hashlib
+import os
+import time
+from pathlib import Path
+
+from django.apps import apps
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+from django.db import transaction
+from django.utils import timezone
+
+# Every (app_label, model_name, field_name) that uses MetadataFileField.
+# Kept in sync with SERVE_FILE_FIELDS in sapl/base/views.py.
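+#
+# A minimal sync check one could add to the test suite (a sketch, assuming
+# SERVE_FILE_FIELDS holds the same (app_label, model_name, field_name)
+# tuples as this list):
+#
+#     from sapl.base.views import SERVE_FILE_FIELDS
+#     assert set(METADATA_FILE_FIELDS) == set(SERVE_FILE_FIELDS)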
+METADATA_FILE_FIELDS = [
+    ('materia', 'materialegislativa', 'texto_original'),
+    ('materia', 'documentoacessorio', 'arquivo'),
+    ('materia', 'proposicao', 'texto_original'),
+    ('protocoloadm', 'documentoadministrativo', 'texto_integral'),
+    ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'),
+    ('norma', 'normajuridica', 'texto_integral'),
+    ('norma', 'anexonormajuridica', 'anexo_arquivo'),
+    ('comissoes', 'reuniao', 'upload_pauta'),
+    ('comissoes', 'reuniao', 'upload_ata'),
+    ('comissoes', 'reuniao', 'upload_anexo'),
+    ('comissoes', 'documentoacessorio', 'arquivo'),
+    ('audiencia', 'audienciapublica', 'upload_pauta'),
+    ('audiencia', 'audienciapublica', 'upload_ata'),
+    ('audiencia', 'audienciapublica', 'upload_anexo'),
+    ('audiencia', 'anexoaudienciapublica', 'arquivo'),
+    ('sessao', 'sessaoplenaria', 'upload_pauta'),
+    ('sessao', 'sessaoplenaria', 'upload_ata'),
+    ('sessao', 'sessaoplenaria', 'upload_anexo'),
+    ('sessao', 'justificativaausencia', 'upload_anexo'),
+    ('sessao', 'orador', 'upload_anexo'),
+    ('sessao', 'oradorexpediente', 'upload_anexo'),
+    ('sessao', 'oradorordemdia', 'upload_anexo'),
+]
+
+
+def _compute_hash(path):
+    """Stream the file through SHA-256 in 64 KiB chunks (constant memory)."""
+    h = hashlib.sha256()
+    with open(path, 'rb') as fh:
+        for chunk in iter(lambda: fh.read(65536), b''):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+class Command(BaseCommand):
+    help = (
+        'Backfill FileMetadata rows for all existing uploaded files. '
+        'Idempotent — skips rows that already have a _metadata FK set.'
+    )
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--batch-size', type=int, default=500,
+            help='Rows fetched per queryset batch (default: 500).',
+        )
+        parser.add_argument(
+            '--rate-limit', type=int, default=0,
+            help='Max rows per second (0 = unlimited; useful on NFS to avoid I/O storms).',
+        )
+        parser.add_argument(
+            '--dry-run', action='store_true',
+            help='Report what would be done without writing anything.',
+        )
+        parser.add_argument(
+            '--app', type=str, default=None,
+            help='Restrict to a single app_label.',
+        )
+        parser.add_argument(
+            '--model', type=str, default=None,
+            help='Restrict to a single model_name (requires --app).',
+        )
+        parser.add_argument(
+            '--skip-hash', action='store_true',
+            help='Skip content_hash computation (only stat for file_size_bytes).',
+        )
+
+    def handle(self, *args, **options):
+        from sapl.base.models import FileMetadata
+
+        batch_size = options['batch_size']
+        rate_limit = options['rate_limit']
+        dry_run = options['dry_run']
+        only_app = options['app']
+        only_model = options['model']
+        skip_hash = options['skip_hash']
+
+        if only_model and not only_app:
+            raise CommandError('--model requires --app.')
+
+        if dry_run:
+            self.stdout.write(self.style.WARNING('DRY RUN — no changes will be written.'))
+
+        targets = [
+            (app, model, field)
+            for (app, model, field) in METADATA_FILE_FIELDS
+            if (only_app is None or app == only_app)
+            and (only_model is None or model == only_model)
+        ]
+
+        total_created = 0
+        total_errors = 0
+
+        for app_label, model_name, field_name in targets:
+            try:
+                Model = apps.get_model(app_label, model_name)
+            except LookupError:
+                self.stdout.write(
+                    self.style.ERROR(f'Model {app_label}.{model_name} not found — skipping.'))
+                continue
+
+            meta_fk = f'{field_name}_metadata'
+            # Only process rows where the file is set but the metadata FK is
+            # NULL. FileField columns usually store '' (not NULL) for "no
+            # file", so both the isnull filter and the exclude are needed.
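+            # Roughly the SQL this builds (illustrative only; the FK column
+            # name assumes Django's default <field>_metadata_id naming):
+            #   SELECT ... FROM <table>
+            #   WHERE <field> IS NOT NULL
+            #     AND <field> <> ''
+            #     AND <field>_metadata_id IS NULL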
+            qs = Model.objects.filter(
+                **{f'{field_name}__isnull': False,
+                   f'{meta_fk}__isnull': True}
+            ).exclude(
+                **{field_name: ''}
+            ).only('pk', field_name, meta_fk)
+
+            count = qs.count()
+            if count == 0:
+                self.stdout.write(
+                    f'{app_label}.{model_name}.{field_name}: already up to date.')
+                continue
+
+            self.stdout.write(
+                f'{app_label}.{model_name}.{field_name}: {count} rows to backfill...')
+
+            batch_start = time.time()
+            processed = 0
+
+            for instance in qs.iterator(chunk_size=batch_size):
+                field_file = getattr(instance, field_name)
+                storage_name = field_file.name  # relative path stored in DB
+                original_filename = Path(storage_name).name
+
+                full_path = os.path.join(settings.MEDIA_ROOT, storage_name)
+
+                file_size_bytes = None
+                content_hash = ''
+
+                try:
+                    st = os.stat(full_path)
+                    file_size_bytes = st.st_size
+                    if not skip_hash:
+                        content_hash = _compute_hash(full_path)
+                except OSError as e:
+                    self.stdout.write(
+                        self.style.WARNING(
+                            f'  pk={instance.pk}: cannot read {full_path}: {e}'))
+                    total_errors += 1
+                    # Deliberate fall-through: a row with NULL file_size_bytes
+                    # is still created, recording the broken reference (RFC §7).
+
+                if dry_run:
+                    self.stdout.write(
+                        f'  [dry-run] pk={instance.pk} → {storage_name}')
+                    total_created += 1
+                    processed += 1
+                    continue
+
+                with transaction.atomic():
+                    meta = FileMetadata(
+                        storage_name=storage_name,
+                        original_filename=original_filename,
+                        file_size_bytes=file_size_bytes,
+                        content_hash=content_hash,
+                        backfilled_at=timezone.now(),
+                    )
+                    meta.save()
+                    setattr(instance, f'{meta_fk}_id', meta.pk)
+                    instance.save(update_fields=[f'{meta_fk}_id'])
+
+                total_created += 1
+                processed += 1
+
+                if rate_limit > 0:
+                    # Cumulative limiter: sleep until processed/rate_limit
+                    # seconds have elapsed since this model's run started.
+                    elapsed = time.time() - batch_start
+                    target_elapsed = processed / rate_limit
+                    if target_elapsed > elapsed:
+                        time.sleep(target_elapsed - elapsed)
+
+            self.stdout.write(
+                self.style.SUCCESS(
+                    f'  done: {processed} rows processed.'))
+
+        self.stdout.write('')
+        self.stdout.write(
+            f'Backfill complete — created: {total_created}, '
+            f'errors (file missing): {total_errors}.')
+        if dry_run:
+            self.stdout.write(self.style.WARNING('(dry run — nothing was written)'))
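+
+# Post-run sanity check (illustrative; run in `python manage.py shell`):
+#
+#     from sapl.base.models import FileMetadata
+#     FileMetadata.objects.count()                               # rows created
+#     FileMetadata.objects.filter(file_size_bytes__isnull=True)  # missing files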