Browse Source

feat: add backfill_file_metadata management command (RFC §8)

Creates FileMetadata rows for all existing uploaded files that have a non-empty
file field but no _metadata FK yet.  Idempotent and safe to interrupt.

Features:
  --dry-run       report without writing
  --batch-size    rows per batch (default 500)
  --rate-limit    max rows/s for NFS-constrained deployments
  --skip-hash     stat-only (skip SHA-256 for very large instances)
  --app / --model restrict to a single model

For each qualifying row it:
  1. reads storage_name from the DB field value (relative path, unchanged)
  2. derives original_filename from the path's basename
  3. stat()s the file for file_size_bytes (fast, ~0.3 ms/file)
  4. optionally SHA-256 hashes it for content_hash (throughput-bound)
  5. creates a FileMetadata row and sets the _metadata FK

Missing files are warned about and counted but do not abort the run; this is
expected on dev DBs and implements RFC §7 "broken reference detection".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
file-metafields
Edward Ribeiro 2 weeks ago
parent
commit
f3f22c6bba
  1. 204
      sapl/base/management/commands/backfill_file_metadata.py

204
sapl/base/management/commands/backfill_file_metadata.py

@ -0,0 +1,204 @@
"""
Backfill FileMetadata rows for all existing uploaded files.
Run once after deploying MetadataFileField to production. Safe to interrupt
and re-run: processes only rows where the _metadata FK is NULL and the file
field is non-empty.
Usage:
python manage.py backfill_file_metadata
python manage.py backfill_file_metadata --batch-size=200 --rate-limit=20
python manage.py backfill_file_metadata --dry-run
python manage.py backfill_file_metadata --app materia --model materialegislativa
"""
import hashlib
import os
import time
from pathlib import Path
from django.apps import apps
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from django.utils import timezone
# Every (app_label, model_name, field_name) that uses MetadataFileField.
# Kept in sync with SERVE_FILE_FIELDS in sapl/base/views.py.
# Order matters only for reporting: models are backfilled in list order.
METADATA_FILE_FIELDS = [
    ('materia', 'materialegislativa', 'texto_original'),
    ('materia', 'documentoacessorio', 'arquivo'),
    ('materia', 'proposicao', 'texto_original'),
    ('protocoloadm', 'documentoadministrativo', 'texto_integral'),
    ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'),
    ('norma', 'normajuridica', 'texto_integral'),
    ('norma', 'anexonormajuridica', 'anexo_arquivo'),
    ('comissoes', 'reuniao', 'upload_pauta'),
    ('comissoes', 'reuniao', 'upload_ata'),
    ('comissoes', 'reuniao', 'upload_anexo'),
    ('comissoes', 'documentoacessorio', 'arquivo'),
    ('audiencia', 'audienciapublica', 'upload_pauta'),
    ('audiencia', 'audienciapublica', 'upload_ata'),
    ('audiencia', 'audienciapublica', 'upload_anexo'),
    ('audiencia', 'anexoaudienciapublica', 'arquivo'),
    ('sessao', 'sessaoplenaria', 'upload_pauta'),
    ('sessao', 'sessaoplenaria', 'upload_ata'),
    ('sessao', 'sessaoplenaria', 'upload_anexo'),
    ('sessao', 'justificativaausencia', 'upload_anexo'),
    ('sessao', 'orador', 'upload_anexo'),
    ('sessao', 'oradorexpediente', 'upload_anexo'),
    ('sessao', 'oradorordemdia', 'upload_anexo'),
]
def _compute_hash(path):
h = hashlib.sha256()
with open(path, 'rb') as fh:
for chunk in iter(lambda: fh.read(65536), b''):
h.update(chunk)
return h.hexdigest()
class Command(BaseCommand):
    """Backfill FileMetadata rows for legacy uploads (RFC §8).

    Idempotent: only rows whose ``<field>_metadata`` FK is NULL and whose
    file field is non-empty are processed, so the command is safe to
    interrupt and re-run.  Missing files are warned about and counted but
    do not abort the run; a FileMetadata row is still created for them
    (with ``file_size_bytes=None``) so broken references stay detectable.
    """

    help = (
        'Backfill FileMetadata rows for all existing uploaded files. '
        'Idempotent — skips rows that already have a _metadata FK set.'
    )

    def add_arguments(self, parser):
        parser.add_argument(
            '--batch-size', type=int, default=500,
            help='Number of rows to process per batch (default: 500).',
        )
        parser.add_argument(
            '--rate-limit', type=int, default=0,
            help='Max rows per second (0 = unlimited; useful on NFS to avoid I/O storms).',
        )
        parser.add_argument(
            '--dry-run', action='store_true',
            help='Report what would be done without writing anything.',
        )
        parser.add_argument(
            '--app', type=str, default=None,
            help='Restrict to a single app_label.',
        )
        parser.add_argument(
            '--model', type=str, default=None,
            help='Restrict to a single model_name (requires --app).',
        )
        parser.add_argument(
            '--skip-hash', action='store_true',
            help='Skip content_hash computation (only stat for file_size_bytes).',
        )

    def handle(self, *args, **options):
        # Imported lazily so this module can be imported before the app
        # registry is fully populated.
        from sapl.base.models import FileMetadata

        batch_size = options['batch_size']
        rate_limit = options['rate_limit']
        dry_run = options['dry_run']
        only_app = options['app']
        only_model = options['model']
        skip_hash = options['skip_hash']

        if dry_run:
            self.stdout.write(self.style.WARNING('DRY RUN — no changes will be written.'))

        # Restrict the target list by --app / --model if given.
        targets = [
            (app, model, field)
            for (app, model, field) in METADATA_FILE_FIELDS
            if (only_app is None or app == only_app)
            and (only_model is None or model == only_model)
        ]

        total_created = 0
        total_errors = 0

        for app_label, model_name, field_name in targets:
            try:
                Model = apps.get_model(app_label, model_name)
            except LookupError:
                self.stdout.write(
                    self.style.ERROR(f'Model {app_label}.{model_name} not found — skipping.'))
                continue

            meta_fk = f'{field_name}_metadata'
            # Only process rows where the file is set but the metadata FK is NULL.
            qs = Model.objects.filter(
                **{f'{field_name}__isnull': False,
                   f'{meta_fk}__isnull': True}
            ).exclude(
                **{field_name: ''}
            ).only('pk', field_name, meta_fk)

            count = qs.count()
            if count == 0:
                self.stdout.write(
                    f'{app_label}.{model_name}.{field_name}: already up to date.')
                continue
            self.stdout.write(
                f'{app_label}.{model_name}.{field_name}: {count} rows to backfill...')

            batch_start = time.time()
            processed = 0
            for instance in qs.iterator(chunk_size=batch_size):
                field_file = getattr(instance, field_name)
                storage_name = field_file.name  # relative path stored in DB
                original_filename = Path(storage_name).name
                full_path = os.path.join(settings.MEDIA_ROOT, storage_name)

                file_size_bytes = None
                content_hash = ''
                try:
                    stat = os.stat(full_path)
                    file_size_bytes = stat.st_size
                    if not skip_hash:
                        content_hash = _compute_hash(full_path)
                except OSError as e:
                    # Missing/unreadable file: warn and keep going — the row
                    # still gets a FileMetadata so broken refs are detectable.
                    self.stdout.write(
                        self.style.WARNING(
                            f' pk={instance.pk}: cannot read {full_path}: {e}'))
                    total_errors += 1

                if dry_run:
                    # BUGFIX: separator added between pk and path (was fused
                    # as pk=123materia/file.pdf).
                    self.stdout.write(
                        f' [dry-run] pk={instance.pk} {storage_name}')
                    total_created += 1
                    processed += 1
                    continue

                # Create metadata and set the FK in one transaction so an
                # interrupt never leaves an orphaned FileMetadata row linked
                # to nothing (re-runs would otherwise double-create).
                with transaction.atomic():
                    meta = FileMetadata(
                        storage_name=storage_name,
                        original_filename=original_filename,
                        file_size_bytes=file_size_bytes,
                        content_hash=content_hash,
                        backfilled_at=timezone.now(),
                    )
                    meta.save()
                    setattr(instance, f'{meta_fk}_id', meta.pk)
                    instance.save(update_fields=[f'{meta_fk}_id'])

                total_created += 1
                processed += 1

                if rate_limit > 0:
                    # Sleep just enough so cumulative throughput for this
                    # model stays at or below rate_limit rows/second.
                    elapsed = time.time() - batch_start
                    target_elapsed = processed / rate_limit
                    if target_elapsed > elapsed:
                        time.sleep(target_elapsed - elapsed)

            self.stdout.write(
                self.style.SUCCESS(
                    f' done: {processed} rows processed.'))

        self.stdout.write('')
        self.stdout.write(
            f'Backfill complete — created: {total_created}, '
            f'errors (file missing): {total_errors}.')
        if dry_run:
            self.stdout.write(self.style.WARNING('(dry run — nothing was written)'))
Loading…
Cancel
Save