mirror of https://github.com/interlegis/sapl.git
Creates FileMetadata rows for all existing uploaded files that have a
non-empty file field but no _metadata FK yet. Idempotent and safe to
interrupt.

Features:

  --dry-run      report without writing
  --batch-size   rows per batch (default 500)
  --rate-limit   max rows/s for NFS-constrained deployments
  --skip-hash    stat-only (skip SHA-256 for very large instances)
  --app/--model  restrict to a single model

For each qualifying row it:

  1. reads storage_name from the DB field value (relative path, unchanged)
  2. derives original_filename from the path's basename
  3. stat()s the file for file_size_bytes (fast, ~0.3 ms/file)
  4. optionally SHA-256 hashes it for content_hash (throughput-bound)
  5. creates a FileMetadata row and sets the _metadata FK

Missing files are warned and counted but do not abort the run, which is
expected on dev DBs and documents RFC §7 "broken reference detection".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
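The diff references sapl.base.models.FileMetadata, which is not part of this change. Inferring only from the fields the command writes, the assumed model looks roughly like the sketch below; the field names come from this diff, while the types and options are guesses:

# Sketch of the assumed FileMetadata model; the real definition lives in
# sapl/base/models.py and may differ in types, lengths, and options.
from django.db import models

class FileMetadata(models.Model):
    storage_name = models.CharField(max_length=512)       # relative path under MEDIA_ROOT
    original_filename = models.CharField(max_length=255)  # basename of storage_name
    file_size_bytes = models.BigIntegerField(null=True)   # NULL when the file is missing on disk
    content_hash = models.CharField(max_length=64, blank=True)  # SHA-256 hex; '' with --skip-hash
    backfilled_at = models.DateTimeField(null=True)       # stamped by the backfill command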
1 changed file with 204 additions and 0 deletions
@@ -0,0 +1,204 @@
"""
Backfill FileMetadata rows for all existing uploaded files.

Run once after deploying MetadataFileField to production. Safe to interrupt
and re-run: processes only rows where the _metadata FK is NULL and the file
field is non-empty.

Usage:
    python manage.py backfill_file_metadata
    python manage.py backfill_file_metadata --batch-size=200 --rate-limit=20
    python manage.py backfill_file_metadata --dry-run
    python manage.py backfill_file_metadata --app materia --model materialegislativa
"""
import hashlib
import os
import time
from pathlib import Path

from django.apps import apps
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.utils import timezone

# Every (app_label, model_name, field_name) that uses MetadataFileField.
# Kept in sync with SERVE_FILE_FIELDS in sapl/base/views.py.
METADATA_FILE_FIELDS = [
    ('materia', 'materialegislativa', 'texto_original'),
    ('materia', 'documentoacessorio', 'arquivo'),
    ('materia', 'proposicao', 'texto_original'),
    ('protocoloadm', 'documentoadministrativo', 'texto_integral'),
    ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'),
    ('norma', 'normajuridica', 'texto_integral'),
    ('norma', 'anexonormajuridica', 'anexo_arquivo'),
    ('comissoes', 'reuniao', 'upload_pauta'),
    ('comissoes', 'reuniao', 'upload_ata'),
    ('comissoes', 'reuniao', 'upload_anexo'),
    ('comissoes', 'documentoacessorio', 'arquivo'),
    ('audiencia', 'audienciapublica', 'upload_pauta'),
    ('audiencia', 'audienciapublica', 'upload_ata'),
    ('audiencia', 'audienciapublica', 'upload_anexo'),
    ('audiencia', 'anexoaudienciapublica', 'arquivo'),
    ('sessao', 'sessaoplenaria', 'upload_pauta'),
    ('sessao', 'sessaoplenaria', 'upload_ata'),
    ('sessao', 'sessaoplenaria', 'upload_anexo'),
    ('sessao', 'justificativaausencia', 'upload_anexo'),
    ('sessao', 'orador', 'upload_anexo'),
    ('sessao', 'oradorexpediente', 'upload_anexo'),
    ('sessao', 'oradorordemdia', 'upload_anexo'),
]


def _compute_hash(path):
    """SHA-256 hex digest, streamed in 64 KiB chunks so memory stays flat."""
    h = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()


class Command(BaseCommand):
    help = (
        'Backfill FileMetadata rows for all existing uploaded files. '
        'Idempotent — skips rows that already have a _metadata FK set.'
    )

    def add_arguments(self, parser):
        parser.add_argument(
            '--batch-size', type=int, default=500,
            help='Number of rows to process per batch (default: 500).',
        )
        parser.add_argument(
            '--rate-limit', type=int, default=0,
            help='Max rows per second (0 = unlimited; useful on NFS to avoid I/O storms).',
        )
        parser.add_argument(
            '--dry-run', action='store_true',
            help='Report what would be done without writing anything.',
        )
        parser.add_argument(
            '--app', type=str, default=None,
            help='Restrict to a single app_label.',
        )
        parser.add_argument(
            '--model', type=str, default=None,
            help='Restrict to a single model_name (requires --app).',
        )
        parser.add_argument(
            '--skip-hash', action='store_true',
            help='Skip content_hash computation (only stat for file_size_bytes).',
        )

    def handle(self, *args, **options):
        from sapl.base.models import FileMetadata

        batch_size = options['batch_size']
        rate_limit = options['rate_limit']
        dry_run = options['dry_run']
        only_app = options['app']
        only_model = options['model']
        skip_hash = options['skip_hash']

        # Enforce the documented contract of --model (see its help text).
        if only_model and not only_app:
            raise CommandError('--model requires --app.')

        if dry_run:
            self.stdout.write(self.style.WARNING('DRY RUN — no changes will be written.'))

        targets = [
            (app, model, field)
            for (app, model, field) in METADATA_FILE_FIELDS
            if (only_app is None or app == only_app)
            and (only_model is None or model == only_model)
        ]

        total_created = 0
        total_errors = 0

        for app_label, model_name, field_name in targets:
            try:
                Model = apps.get_model(app_label, model_name)
            except LookupError:
                self.stdout.write(
                    self.style.ERROR(f'Model {app_label}.{model_name} not found — skipping.'))
                continue

            meta_fk = f'{field_name}_metadata'
            # Only process rows where the file is set but the metadata FK is NULL.
            qs = Model.objects.filter(
                **{f'{field_name}__isnull': False,
                   f'{meta_fk}__isnull': True}
            ).exclude(
                **{field_name: ''}
            ).only('pk', field_name, meta_fk)

            count = qs.count()
            if count == 0:
                self.stdout.write(
                    f'{app_label}.{model_name}.{field_name}: already up to date.')
                continue

            self.stdout.write(
                f'{app_label}.{model_name}.{field_name}: {count} rows to backfill...')

            batch_start = time.time()
            processed = 0

            for instance in qs.iterator(chunk_size=batch_size):
                field_file = getattr(instance, field_name)
                storage_name = field_file.name  # relative path stored in DB
                original_filename = Path(storage_name).name

                full_path = os.path.join(settings.MEDIA_ROOT, storage_name)

                file_size_bytes = None
                content_hash = ''

                try:
                    stat_result = os.stat(full_path)
                    file_size_bytes = stat_result.st_size
                    if not skip_hash:
                        content_hash = _compute_hash(full_path)
                except OSError as e:
                    self.stdout.write(
                        self.style.WARNING(
                            f'  pk={instance.pk}: cannot read {full_path}: {e}'))
                    total_errors += 1

                if dry_run:
                    self.stdout.write(
                        f'  [dry-run] pk={instance.pk} → {storage_name}')
                    total_created += 1
                    processed += 1
                    continue

                with transaction.atomic():
                    meta = FileMetadata(
                        storage_name=storage_name,
                        original_filename=original_filename,
                        file_size_bytes=file_size_bytes,
                        content_hash=content_hash,
                        backfilled_at=timezone.now(),
                    )
                    meta.save()
                    setattr(instance, f'{meta_fk}_id', meta.pk)
                    instance.save(update_fields=[f'{meta_fk}_id'])

                total_created += 1
                processed += 1

                if rate_limit > 0:
                    # Sleep just enough to hold average throughput at rate_limit rows/s.
                    elapsed = time.time() - batch_start
                    target_elapsed = processed / rate_limit
                    if target_elapsed > elapsed:
                        time.sleep(target_elapsed - elapsed)

            self.stdout.write(
                self.style.SUCCESS(
                    f'  done: {processed} rows processed.'))

        self.stdout.write('')
        self.stdout.write(
            f'Backfill complete — created: {total_created}, '
            f'errors (file missing): {total_errors}.')
        if dry_run:
            self.stdout.write(self.style.WARNING('(dry run — nothing was written)'))
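Not part of the commit, but the "safe to interrupt and re-run" claim is easy to check: count the rows the command would still pick up, using the same NULL-FK condition. A sketch for python manage.py shell; the import path is an assumption, since the module's location is not shown in this diff:

from django.apps import apps
# Assumed module path; adjust to wherever backfill_file_metadata.py actually lives.
from sapl.base.management.commands.backfill_file_metadata import METADATA_FILE_FIELDS

for app_label, model_name, field_name in METADATA_FILE_FIELDS:
    Model = apps.get_model(app_label, model_name)
    pending = Model.objects.filter(
        **{f'{field_name}__isnull': False,
           f'{field_name}_metadata__isnull': True}
    ).exclude(**{field_name: ''}).count()
    if pending:
        print(f'{app_label}.{model_name}.{field_name}: {pending} rows still pending')

A zero-row report for every field means the backfill is complete; re-running the command then makes no writes.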