From cdca221f5876852900208b494dc7197d7c5c8cc6 Mon Sep 17 00:00:00 2001 From: Edward Oliveira Date: Sun, 19 Apr 2026 12:51:24 -0300 Subject: [PATCH] feat: add new fields to FileMetadata --- .gitignore | 2 + CLAUDE.md | 151 +++++++++ docker/docker-compose.yaml | 8 +- docker/startup_scripts/start.sh | 2 +- sapl/base/fields.py | 309 ++++++++++++------ .../commands/backfill_file_metadata.py | 281 +++++++++++----- .../commands/backfill_file_metadata_hashes.py | 111 +++++++ .../backfill_file_metadata_structural.py | 153 +++++++++ .../0062_filemetadata_owner_fields.py | 43 +++ sapl/base/models.py | 11 + sapl/base/views.py | 123 +++++-- 11 files changed, 975 insertions(+), 219 deletions(-) create mode 100644 CLAUDE.md create mode 100644 sapl/base/management/commands/backfill_file_metadata_hashes.py create mode 100644 sapl/base/management/commands/backfill_file_metadata_structural.py create mode 100644 sapl/base/migrations/0062_filemetadata_owner_fields.py diff --git a/.gitignore b/.gitignore index 0013e9542..6ce33c733 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,5 @@ media/* !media/.gitkeep restauracoes/* + +.claude/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..e2aa07539 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,151 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +SAPL (Sistema de Apoio ao Processo Legislativo) is a Django-based legislative management system used by Brazilian municipal and state legislative houses. It manages bills, parliamentary sessions, committees, norms, protocols, and related legislative workflows. + +## Design references + +@/Users/eribeiro/projects/sapl-docs/rfc/files-metafields-impl.md + + +## Commands + +### Development + +```bash +# Run dev server +python manage.py runserver + +# Docker (dev, without bundled DB) +docker-compose -f docker/docker-compose-dev.yml up + +# Docker (dev, with PostgreSQL container) +docker-compose -f docker/docker-compose-dev-db.yml up +``` + +### Database Setup (local PostgreSQL) + +```bash +sudo -u postgres psql -c "CREATE ROLE sapl LOGIN ENCRYPTED PASSWORD 'sapl' NOSUPERUSER INHERIT CREATEDB NOCREATEROLE NOREPLICATION;" +sudo -u postgres psql -c "CREATE DATABASE sapl WITH OWNER=sapl ENCODING='UTF8' LC_COLLATE='pt_BR.UTF-8' LC_CTYPE='pt_BR.UTF-8' CONNECTION LIMIT=-1 TEMPLATE template0;" +python manage.py migrate +``` + +### Testing + +```bash +# All tests (reuses DB by default for speed) +pytest + +# Single test file or test function +pytest sapl/materia/tests/test_materia.py +pytest sapl/materia/tests/test_materia.py::test_function_name + +# Force DB recreation +pytest --create-db + +# With coverage +pytest --cov=sapl +``` + +Tests require `DJANGO_SETTINGS_MODULE=sapl.settings` (set in `pytest.ini`). All tests must be marked with `@pytest.mark.django_db`. The `conftest.py` root fixture provides an `app` fixture (WebTest `DjangoTestApp`). + +### Linting / Formatting + +```bash +flake8 . +isort . 
+autopep8 --in-place +``` + +### Restore Database from Backup + +```bash +./scripts/restore_db.sh -f /path/to/dump +./scripts/restore_db.sh -f /path/to/dump -p 5433 # Docker port +``` + +## Architecture + +### Django Apps + +Apps are under `sapl/` and follow domain boundaries: + +| App | Domain | +|-----|--------| +| `base` | `CasaLegislativa` (legislative house config), `AppConfig`, `Autor` (authorship) | +| `parliamentary` | `Parlamentar`, `Legislatura`, `SessaoLegislativa`, `Coligacao` | +| `materia` | Bills (`MateriaLegislativa`), types, tracking, annexes | +| `norma` | Laws/norms (`NormaJuridica`) and hierarchies | +| `sessao` | Plenary sessions, agenda, attendance, voting | +| `comissoes` | Committees (`Comissao`) and meetings (`Reuniao`) | +| `protocoloadm` | Administrative protocols and document intake | +| `compilacao` | Structured/articulated texts (LexML-like tree structure) | +| `lexml` | LexML XML standard integration | +| `audiencia` | Public hearings | +| `painel` | Real-time session display panel | +| `relatorios` | PDF report generation | +| `api` | REST API entry point (auto-generated ViewSets) | +| `crud` | Generic CRUD base views | +| `rules` | Business rules and permission definitions | + +### REST API + +The API uses a custom `drfautoapi` package (`drfautoapi/drfautoapi.py`) that auto-generates DRF ViewSets, Serializers, and FilterSets from Django models. Authentication is Token + Session. Permissions use a custom `SaplModelPermissions` class that maps HTTP methods to Django model permissions. + +OpenAPI 3.0 docs are generated by drf-spectacular. + +### Caching + +- **Default:** File-based (`/var/tmp/django_cache`) +- **Production:** Redis via django-redis; configured at startup by `configure_redis_cache()` in `sapl/settings.py` +- **Cache key prefix:** `cache:{POD_NAMESPACE}:` (namespace-isolated for multi-tenant k8s) +- **Rate limiter state** is shared via Redis keys + +### Feature Flags + +django-waffle is used for feature flags. Switches (global on/off) can be toggled via: + +```bash +python manage.py waffle_switch on|off +``` + +### Key Environment Variables + +| Variable | Purpose | +|----------|---------| +| `DATABASE_URL` | PostgreSQL connection string | +| `SECRET_KEY` | Django secret key | +| `DEBUG` | Debug mode | +| `REDIS_URL` | Redis host:port | +| `CACHE_BACKEND` | `file` or `redis` | +| `POD_NAMESPACE` | K8s namespace (used in cache key prefix) | +| `USE_SOLR` | Enable Haystack/Solr full-text search | +| `SOLR_URL` / `SOLR_COLLECTION` | Solr connection | + +### Docker Build + +The production build requires a MaxMind GeoLite2-ASN license key (for nginx ASN-based bot blocking): + +```bash +docker build --secret id=maxmind_key,src=.env -f docker/Dockerfile -t sapl:local . +``` + +Optional build args: `WITH_NGINX`, `WITH_GRAPHVIZ`, `WITH_POPPLER`, `WITH_PSQL_CLIENT`. 
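For example, to bake nginx and Poppler into the image, pass the args alongside the secret (a sketch only — the `1` values are an assumption; check `docker/Dockerfile` for the exact values each ARG expects):

```bash
# Hypothetical invocation: build args names come from the list above,
# their values are assumed to be simple on/off flags.
docker build --secret id=maxmind_key,src=.env \
  --build-arg WITH_NGINX=1 \
  --build-arg WITH_POPPLER=1 \
  -f docker/Dockerfile -t sapl:local .
```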
+ +### Key File Locations + +| File | Purpose | +|------|---------| +| `sapl/settings.py` | All Django settings, including cache/rate-limit setup | +| `pytest.ini` | Test configuration (DJANGO_SETTINGS_MODULE, addopts) | +| `conftest.py` | Root pytest fixtures | +| `drfautoapi/drfautoapi.py` | Auto-API generation logic | +| `docker/startup_scripts/start.sh` | Container entrypoint (migrations, waffle, gunicorn) | +| `requirements/requirements.txt` | Production deps | +| `requirements/test-requirements.txt` | Test deps | +| `requirements/dev-requirements.txt` | Dev/lint deps | + diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index dc8559812..4d953f3fc 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -33,10 +33,10 @@ services: networks: - sapl-net sapl: - image: interlegis/sapl:3.1.165-RC2 -# build: -# context: ../ -# dockerfile: ./docker/Dockerfile +# image: interlegis/sapl:3.1.165-RC2 + build: + context: ../ + dockerfile: ./docker/Dockerfile container_name: sapl labels: NAME: "sapl" diff --git a/docker/startup_scripts/start.sh b/docker/startup_scripts/start.sh index 2bacd2c4a..6c26b90b8 100755 --- a/docker/startup_scripts/start.sh +++ b/docker/startup_scripts/start.sh @@ -273,7 +273,7 @@ main() { # deployed. Runs as a background job so pod startup is not delayed. # Must be after migrate_db so the base_file_metadata table exists. # Once all instances have been fully backfilled this line can be removed. - python3 manage.py backfill_file_metadata --rate-limit=20 & + python3 manage.py backfill_file_metadata_structural & configure_solr || true configure_sapn create_admin diff --git a/sapl/base/fields.py b/sapl/base/fields.py index cd1f57ad8..ccab45710 100644 --- a/sapl/base/fields.py +++ b/sapl/base/fields.py @@ -1,13 +1,24 @@ import hashlib +import logging import posixpath +import unicodedata from pathlib import Path +from urllib.parse import quote from uuid import uuid4 -from django.core.files import File +import magic +from django.core.exceptions import ValidationError from django.core.files.storage import default_storage -from django.db import models +from django.db import models, transaction from django.db.models.fields.files import FieldFile +from django.db.models.signals import post_save +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# MetadataFieldFile — FieldFile subclass with semantic URL support +# --------------------------------------------------------------------------- class MetadataFieldFile(FieldFile): """ @@ -28,8 +39,6 @@ class MetadataFieldFile(FieldFile): meta = getattr(self.instance, meta_attr, None) if meta and meta.original_filename: return meta.original_filename - # Fallback: basename of storage path (may be UUID for newly uploaded files - # whose metadata row hasn't been committed yet). return Path(self.name).name if self.name else '' @property @@ -41,16 +50,12 @@ class MetadataFieldFile(FieldFile): meta_attr = f'{self.field.name}_metadata' meta = getattr(instance, meta_attr, None) - # Fallback: no metadata row yet (pre-backfill existing file or first save - # before commit) → return the raw storage URL so nothing breaks. if meta is None: return self.storage.url(self.name) pk = getattr(instance, 'pk', None) if pk is not None: - # Saved instance — return the semantic alias. - # Lazy import avoids a circular dependency at module load time. 
from django.urls import reverse return reverse( 'serve_model_file', @@ -62,16 +67,112 @@ class MetadataFieldFile(FieldFile): }, ) - # Unsaved instance — return canonical UUID form. return f'/documentos/{meta.uuid}/' -def _compute_size_and_hash(field_file): - """ - Read the file content once to compute size and SHA-256 digest. - field_file must be open-able via field_file.open(). - Returns (size_in_bytes, hex_digest). - """ +# --------------------------------------------------------------------------- +# Filename sanitization +# --------------------------------------------------------------------------- + +def _sanitize_filename(name: str) -> str: + name = name.strip() + name = ''.join(c for c in name if ord(c) >= 0x20 and ord(c) != 0x7F) + name = unicodedata.normalize('NFC', name) + name = ' '.join(name.split()) + name = name[:255] + if not name: + name = 'untitled' + return name + + +# --------------------------------------------------------------------------- +# MIME validation +# --------------------------------------------------------------------------- + +FIELD_ALLOWED_TYPES = { + ('materia', 'materialegislativa', 'texto_original'): frozenset([ + 'application/pdf', 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/rtf', 'text/plain', + ]), + ('materia', 'documentoacessorio', 'arquivo'): frozenset([ + 'application/pdf', 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'image/jpeg', 'image/png', 'image/tiff', + ]), + ('materia', 'proposicao', 'texto_original'): frozenset([ + 'application/pdf', 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + ]), + ('protocoloadm', 'documentoadministrativo', 'texto_integral'): frozenset([ + 'application/pdf', + ]), + ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'): frozenset([ + 'application/pdf', 'image/jpeg', 'image/png', 'image/tiff', + ]), + ('norma', 'normajuridica', 'texto_integral'): frozenset([ + 'application/pdf', 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + ]), + ('norma', 'anexonormajuridica', 'anexo_arquivo'): frozenset([ + 'application/pdf', 'image/jpeg', 'image/png', + ]), + **{ + (app, model, field): frozenset(['application/pdf', 'image/jpeg', 'image/png', 'image/tiff']) + for app, model, field in [ + ('sessao', 'sessaoplenaria', 'upload_pauta'), + ('sessao', 'sessaoplenaria', 'upload_ata'), + ('sessao', 'sessaoplenaria', 'upload_anexo'), + ('sessao', 'justificativaausencia', 'upload_anexo'), + ('comissoes', 'reuniao', 'upload_pauta'), + ('comissoes', 'reuniao', 'upload_ata'), + ('comissoes', 'reuniao', 'upload_anexo'), + ('comissoes', 'documentoacessorio', 'arquivo'), + ('audiencia', 'audienciapublica', 'upload_pauta'), + ('audiencia', 'audienciapublica', 'upload_ata'), + ('audiencia', 'audienciapublica', 'upload_anexo'), + ('audiencia', 'anexoaudienciapublica', 'arquivo'), + ] + }, +} + +_MIME_TO_EXTENSIONS = { + 'application/pdf': {'.pdf'}, + 'application/msword': {'.doc'}, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {'.docx'}, + 'application/rtf': {'.rtf'}, + 'text/plain': {'.txt'}, + 'image/jpeg': {'.jpg', '.jpeg'}, + 'image/png': {'.png'}, + 'image/tiff': {'.tif', '.tiff'}, +} + + +def _validate_file_type(file_obj, app_label, model_name, field_name): + allowed = FIELD_ALLOWED_TYPES.get((app_label, model_name, field_name)) + if allowed is None: + return + header = 
file_obj.file.read(512) + file_obj.file.seek(0) + sniffed = magic.from_buffer(header, mime=True) + if sniffed not in allowed: + raise ValidationError( + f'Tipo de arquivo não permitido: {sniffed}. ' + f'Permitidos: {", ".join(sorted(allowed))}' + ) + ext = Path(file_obj.name).suffix.lower() + if ext and sniffed in _MIME_TO_EXTENSIONS: + if ext not in _MIME_TO_EXTENSIONS[sniffed]: + raise ValidationError( + f'Extensão {ext!r} não corresponde ao tipo detectado ({sniffed}).' + ) + + +# --------------------------------------------------------------------------- +# Content hash + size +# --------------------------------------------------------------------------- + +def _compute_size_and_hash(field_file) -> tuple: h = hashlib.sha256() size = 0 with field_file.open('rb') as fh: @@ -81,37 +182,67 @@ def _compute_size_and_hash(field_file): return size, h.hexdigest() +# --------------------------------------------------------------------------- +# Blob deletion +# --------------------------------------------------------------------------- + +def _delete_blob_safe(storage_name: str) -> None: + try: + default_storage.delete(storage_name) + except Exception: + logger.warning('Failed to delete blob %s', storage_name, exc_info=True) + + +# --------------------------------------------------------------------------- +# Content-Disposition +# --------------------------------------------------------------------------- + +_INLINE_EXTENSIONS = frozenset(['.pdf']) + + +def _content_disposition(filename: str) -> str: + ext = Path(filename).suffix.lower() + disposition = 'inline' if ext in _INLINE_EXTENSIONS else 'attachment' + safe_ascii = filename.encode('ascii', 'replace').decode() + encoded = quote(filename, safe='') + return f'{disposition}; filename="{safe_ascii}"; filename*=UTF-8\'\'{encoded}' + + +# --------------------------------------------------------------------------- +# Visibility / cache helpers +# --------------------------------------------------------------------------- + +_PRIVATE_FIELDS = frozenset({ + ('materia', 'proposicao', 'texto_original'), + ('protocoloadm', 'documentoadministrativo', 'texto_integral'), + ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'), +}) + + +def _visibility(meta) -> str: + return 'private' if (meta.app_label, meta.model_name, meta.field_name) in _PRIVATE_FIELDS else 'public' + + +def _is_public(meta) -> bool: + return (meta.app_label, meta.model_name, meta.field_name) not in _PRIVATE_FIELDS + + +# --------------------------------------------------------------------------- +# MetadataFileField +# --------------------------------------------------------------------------- + class MetadataFileField(models.FileField): """ Drop-in replacement for models.FileField. - Uses MetadataFieldFile as its descriptor so that .url returns the semantic - alias /////download for saved instances, and falls - back to /documentos// for unsaved instances. - - In addition to normal FileField behaviour, this field: - 1. Injects a companion ForeignKey '_metadata' pointing to - base.FileMetadata on the owning model class at class-definition time. - 2. In pre_save, creates or updates the FileMetadata row that tracks the - stable uuid, storage_name, original_filename, size, and hash for the - uploaded file. - - Four lifecycle scenarios handled in pre_save: - Case 1 — first upload : create a new FileMetadata row; set the FK. 
- Case 2 — replacement : delete the old physical file; update the existing - FileMetadata row in-place so the uuid (and therefore - /documentos//) never changes. - Case 3 — field cleared : nullify the FK; delete the FileMetadata row; - physical file cleanup is deferred to the - clean_orphan_files management command. - Case 4 — no-op re-save : nothing touched. + Injects a companion FK '_metadata' → FileMetadata and maintains + it across upload, replacement, and clear operations. """ attr_class = MetadataFieldFile def contribute_to_class(self, cls, name): super().contribute_to_class(cls, name) - # Inject companion FK: e.g. texto_original → texto_original_metadata_id fk = models.ForeignKey( 'base.FileMetadata', null=True, @@ -121,39 +252,35 @@ class MetadataFileField(models.FileField): verbose_name='File metadata', ) cls.add_to_class(f'{name}_metadata', fk) + post_save.connect(self._fix_owner_pk, sender=cls, weak=False) + + def _fix_owner_pk(self, sender, instance, created, **kwargs): + """Fills owner_pk after INSERT — pk is None at pre_save time for new objects.""" + if not created: + return + meta = getattr(instance, f'{self.attname}_metadata', None) + if meta is not None and meta.owner_pk is None: + meta.owner_pk = instance.pk + meta.save(update_fields=['owner_pk']) def generate_filename(self, instance, filename): """ - Override: substitute a UUID for the filename in the upload_to path so - that newly uploaded files get stable, unguessable storage paths like - sapl/public/normajuridica/2025/9395/.pdf (RFC §6.3). - - For replacement uploads (Case 2) the existing meta UUID is reused so - the physical path — and therefore /documentos// — stays stable. - For first uploads (Case 1) a fresh UUID is generated and stashed on the - instance under _pending_uuid_ for pre_save to pick up when - creating the FileMetadata row. + Substitutes a UUID for the filename in the upload_to path so newly + uploaded files get stable, unguessable storage paths (RFC §6.3). """ - # 1. Let upload_to produce the directory + original filename. if callable(self.upload_to): upload_name = self.upload_to(instance, filename) else: upload_name = posixpath.join(self.upload_to, filename) - # 2. Determine the UUID to embed in the path. meta_attr = f'{self.name}_metadata' meta = getattr(instance, meta_attr, None) if meta is not None: - # Replacement (Case 2): reuse existing UUID — path stays identical, - # OverwriteStorage replaces the bytes in-place. file_uuid = str(meta.uuid) else: - # First upload (Case 1): generate a fresh UUID and stash it so - # pre_save can wire it into the new FileMetadata row. file_uuid = str(uuid4()) setattr(instance, f'_pending_uuid_{self.name}', file_uuid) - # 3. Replace the filename portion with . ext = Path(filename).suffix.lower() directory = posixpath.dirname(upload_name) new_name = posixpath.join(directory, f'{file_uuid}{ext}') if directory else f'{file_uuid}{ext}' @@ -167,78 +294,76 @@ class MetadataFileField(models.FileField): file_before = getattr(instance, self.attname) meta_before = getattr(instance, meta_attr, None) - # Capture intent BEFORE super() — storage.save() sets _committed=True, - # erasing the distinction between "new upload" and "already committed". has_new_upload = bool(file_before) and not getattr(file_before, '_committed', True) is_clearing = not file_before and meta_before is not None - # Capture browser-supplied filename before storage renames it to the UUID path. 
- # file_before.name is set to the original upload name by FileDescriptor.__get__ - # when it wraps the UploadedFile — more reliable than file_before.file.name - # which for TemporaryUploadedFile is the NamedTemporaryFile path. + # Sanitize original filename before storage renames it. + # file_before.name holds the user-supplied name at this point. if has_new_upload: - original_filename = Path(file_before.name).name if file_before.name else '' + original_filename = _sanitize_filename(Path(file_before.name).name) if file_before.name else 'untitled' else: original_filename = '' + if has_new_upload: + _validate_file_type(file_before, instance._meta.app_label, + instance._meta.model_name, self.attname) + file = super().pre_save(instance, add) - # file.name is now the UUID-based storage path from generate_filename, - # e.g. "sapl/public/normajuridica/2025/9395/.pdf" if is_clearing: - # Case 3: ClearableFileInput submitted with clear=True. - # Nullify FK on the in-memory instance immediately so the subsequent - # model.save() writes NULL — do NOT rely on SET_NULL cascade, which - # only fires in the DB. Physical file left for offline cleanup. + old_storage_name = meta_before.storage_name setattr(instance, f'{meta_attr}_id', None) meta_before.delete() + transaction.on_commit(lambda: _delete_blob_safe(old_storage_name)) elif file and has_new_upload: storage_name = file.name size, digest = _compute_size_and_hash(file) if meta_before is None: - # Case 1: first upload — create a new FileMetadata row. - # Use the UUID that generate_filename already baked into the path - # so that FileMetadata.uuid matches the on-disk filename. + # Case 1: first upload. pending_uuid = getattr(instance, f'_pending_uuid_{self.name}', None) meta_kwargs = dict( storage_name=storage_name, original_filename=original_filename, file_size_bytes=size, content_hash=digest, + app_label=instance._meta.app_label, + model_name=instance._meta.model_name, + field_name=self.attname, + owner_pk=instance.pk or None, ) if pending_uuid: from uuid import UUID meta_kwargs['uuid'] = UUID(pending_uuid) - # Clean up the temporary stash attribute. try: delattr(instance, f'_pending_uuid_{self.name}') except AttributeError: pass - meta = FileMetadata(**meta_kwargs) - meta.save() - setattr(instance, f'{meta_attr}_id', meta.pk) + with transaction.atomic(): + meta = FileMetadata(**meta_kwargs) + meta.save() + setattr(instance, f'{meta_attr}_id', meta.pk) else: - # Case 2: replacement — reuse the existing row so the stable uuid - # (and /documentos//) never changes. - # For UUID-based paths the old and new paths are identical - # (same uuid, same ext) — OverwriteStorage already replaced the - # bytes in-place, so we must NOT delete the newly written file. - if meta_before.storage_name != storage_name: - # Legacy (non-UUID) paths: old path differs → delete old file. - try: - default_storage.delete(meta_before.storage_name) - except OSError: - pass # already gone — proceed - meta_before.version += 1 - meta_before.storage_name = storage_name - meta_before.original_filename = original_filename - meta_before.file_size_bytes = size - meta_before.content_hash = digest - meta_before.save(update_fields=[ - 'version', 'storage_name', 'original_filename', - 'file_size_bytes', 'content_hash', - ]) + # Case 2: replacement — reuse existing row so uuid (and /documentos//) never changes. 
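+                # A row lock (select_for_update) keeps two concurrent replacements from
+                # racing on the version bump and metadata update; the previous blob is
+                # deleted only after commit, and only when the storage path actually changed.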
+ with transaction.atomic(): + locked = FileMetadata.objects.select_for_update().get(pk=meta_before.pk) + old_storage_name = locked.storage_name + locked.version += 1 + locked.storage_name = storage_name + locked.original_filename = original_filename + locked.file_size_bytes = size + locked.content_hash = digest + locked.app_label = instance._meta.app_label + locked.model_name = instance._meta.model_name + locked.field_name = self.attname + locked.owner_pk = instance.pk + locked.save(update_fields=[ + 'version', 'storage_name', 'original_filename', + 'file_size_bytes', 'content_hash', + 'app_label', 'model_name', 'field_name', 'owner_pk', + ]) + if old_storage_name != storage_name: + transaction.on_commit(lambda: _delete_blob_safe(old_storage_name)) return file diff --git a/sapl/base/management/commands/backfill_file_metadata.py b/sapl/base/management/commands/backfill_file_metadata.py index 4cc7d5d62..79546110e 100644 --- a/sapl/base/management/commands/backfill_file_metadata.py +++ b/sapl/base/management/commands/backfill_file_metadata.py @@ -2,14 +2,24 @@ Backfill FileMetadata rows for all existing uploaded files. Run once after deploying MetadataFileField to production. Safe to interrupt -and re-run: processes only rows where the _metadata FK is NULL and the file -field is non-empty. +and re-run: idempotent in both phases. + +Phase 1 — creates missing FileMetadata rows for parent-model instances whose + _metadata FK is NULL (files uploaded before MetadataFileField was deployed). +Phase 2 — fills in content_hash / file_size_bytes for FileMetadata rows that + exist but are missing those lazy fields (RFC §8 query: + WHERE content_hash = '' OR file_size_bytes IS NULL). + +By default both phases run. Use --phase1 or --phase2 to run only one. Usage: - python manage.py backfill_file_metadata - python manage.py backfill_file_metadata --batch-size=200 --rate-limit=20 + python manage.py backfill_file_metadata # both phases + python manage.py backfill_file_metadata --phase1 # create missing rows only + python manage.py backfill_file_metadata --phase2 # fill size/hash only + python manage.py backfill_file_metadata --phase1 --rate-limit=20 + python manage.py backfill_file_metadata --phase1 --app materia --model materialegislativa + python manage.py backfill_file_metadata --phase2 --skip-hash python manage.py backfill_file_metadata --dry-run - python manage.py backfill_file_metadata --app materia --model materialegislativa """ import hashlib import os @@ -20,6 +30,7 @@ from django.apps import apps from django.conf import settings from django.core.management.base import BaseCommand from django.db import transaction +from django.db.models import Q from django.utils import timezone # Every (app_label, model_name, field_name) that uses MetadataFileField. @@ -61,7 +72,8 @@ def _compute_hash(path): class Command(BaseCommand): help = ( 'Backfill FileMetadata rows for all existing uploaded files. ' - 'Idempotent — skips rows that already have a _metadata FK set.' + 'Idempotent — Phase 1 creates missing rows, Phase 2 fills in ' + 'content_hash / file_size_bytes for incomplete rows (RFC §8).' 
) def add_arguments(self, parser): @@ -79,16 +91,24 @@ class Command(BaseCommand): ) parser.add_argument( '--app', type=str, default=None, - help='Restrict to a single app_label.', + help='Restrict Phase 1 to a single app_label.', ) parser.add_argument( '--model', type=str, default=None, - help='Restrict to a single model_name (requires --app).', + help='Restrict Phase 1 to a single model_name (requires --app).', ) parser.add_argument( '--skip-hash', action='store_true', help='Skip content_hash computation (only stat for file_size_bytes).', ) + parser.add_argument( + '--phase1', action='store_true', + help='Run Phase 1 only (create missing FileMetadata rows).', + ) + parser.add_argument( + '--phase2', action='store_true', + help='Run Phase 2 only (fill in content_hash / file_size_bytes for incomplete rows).', + ) def handle(self, *args, **options): from sapl.base.models import FileMetadata @@ -99,106 +119,191 @@ class Command(BaseCommand): only_app = options['app'] only_model = options['model'] skip_hash = options['skip_hash'] + run_phase1 = options['phase1'] + run_phase2 = options['phase2'] + # if neither flag given, run both + if not run_phase1 and not run_phase2: + run_phase1 = run_phase2 = True + start_time = time.time() if dry_run: self.stdout.write(self.style.WARNING('DRY RUN — no changes will be written.')) - targets = [ - (app, model, field) - for (app, model, field) in METADATA_FILE_FIELDS - if (only_app is None or app == only_app) - and (only_model is None or model == only_model) - ] - total_created = 0 - total_skipped = 0 + total_updated = 0 total_errors = 0 - for app_label, model_name, field_name in targets: - try: - Model = apps.get_model(app_label, model_name) - except LookupError: - self.stdout.write( - self.style.ERROR(f'Model {app_label}.{model_name} not found — skipping.')) - continue - - meta_fk = f'{field_name}_metadata' - # Only process rows where file is set but metadata FK is NULL. 
- qs = Model.objects.filter( - **{f'{field_name}__isnull': False, - f'{meta_fk}__isnull': True} - ).exclude( - **{field_name: ''} - ).only('pk', field_name, meta_fk) - - count = qs.count() - if count == 0: + # ── Phase 1: create FileMetadata rows for FK-NULL parent instances ──────── + + if run_phase1: + self.stdout.write('Phase 1: creating missing FileMetadata rows...') + + targets = [ + (app, model, field) + for (app, model, field) in METADATA_FILE_FIELDS + if (only_app is None or app == only_app) + and (only_model is None or model == only_model) + ] + + for app_label, model_name, field_name in targets: + try: + Model = apps.get_model(app_label, model_name) + except LookupError: + self.stdout.write( + self.style.ERROR(f' Model {app_label}.{model_name} not found — skipping.')) + continue + + meta_fk = f'{field_name}_metadata' + qs = Model.objects.filter( + **{f'{field_name}__isnull': False, + f'{meta_fk}__isnull': True} + ).exclude( + **{field_name: ''} + ).only('pk', field_name, meta_fk) + + count = qs.count() + if count == 0: + self.stdout.write( + f' {app_label}.{model_name}.{field_name}: up to date.') + continue + self.stdout.write( - f'{app_label}.{model_name}.{field_name}: already up to date.') - continue + f' {app_label}.{model_name}.{field_name}: {count} rows to backfill...') - self.stdout.write( - f'{app_label}.{model_name}.{field_name}: {count} rows to backfill...') + batch_start = time.time() + processed = 0 - batch_start = time.time() - processed = 0 + for instance in qs.iterator(chunk_size=batch_size): + field_file = getattr(instance, field_name) + storage_name = field_file.name + original_filename = Path(storage_name).name + full_path = os.path.join(settings.MEDIA_ROOT, storage_name) - for instance in qs.iterator(chunk_size=batch_size): - field_file = getattr(instance, field_name) - storage_name = field_file.name # relative path stored in DB - original_filename = Path(storage_name).name + file_size_bytes = None + content_hash = '' - full_path = os.path.join(settings.MEDIA_ROOT, storage_name) + try: + stat = os.stat(full_path) + file_size_bytes = stat.st_size + if not skip_hash: + content_hash = _compute_hash(full_path) + except OSError as e: + self.stdout.write( + self.style.WARNING( + f' pk={instance.pk}: cannot read {full_path}: {e}')) + total_errors += 1 - file_size_bytes = None - content_hash = '' + if dry_run: + self.stdout.write( + f' [dry-run] pk={instance.pk} → {storage_name}') + total_created += 1 + processed += 1 + continue - try: - stat = os.stat(full_path) - file_size_bytes = stat.st_size - if not skip_hash: - content_hash = _compute_hash(full_path) - except OSError as e: - self.stdout.write( - self.style.WARNING( - f' pk={instance.pk}: cannot read {full_path}: {e}')) - total_errors += 1 + with transaction.atomic(): + meta = FileMetadata( + storage_name=storage_name, + original_filename=original_filename, + file_size_bytes=file_size_bytes, + content_hash=content_hash, + backfilled_at=timezone.now(), + app_label=app_label, + model_name=model_name, + field_name=field_name, + owner_pk=instance.pk, + ) + meta.save() + setattr(instance, f'{meta_fk}_id', meta.pk) + instance.save(update_fields=[f'{meta_fk}_id']) - if dry_run: - self.stdout.write( - f' [dry-run] pk={instance.pk} → {storage_name}') total_created += 1 processed += 1 - continue - with transaction.atomic(): - meta = FileMetadata( - storage_name=storage_name, - original_filename=original_filename, - file_size_bytes=file_size_bytes, - content_hash=content_hash, - backfilled_at=timezone.now(), - ) - meta.save() 
- setattr(instance, f'{meta_fk}_id', meta.pk) - instance.save(update_fields=[f'{meta_fk}_id']) - - total_created += 1 - processed += 1 - - if rate_limit > 0: - elapsed = time.time() - batch_start - target_elapsed = processed / rate_limit - if target_elapsed > elapsed: - time.sleep(target_elapsed - elapsed) - - self.stdout.write( - self.style.SUCCESS( - f' done: {processed} rows processed.')) + if rate_limit > 0: + elapsed = time.time() - batch_start + target_elapsed = processed / rate_limit + if target_elapsed > elapsed: + time.sleep(target_elapsed - elapsed) + + self.stdout.write( + self.style.SUCCESS(f' done: {processed} rows created.')) + else: + self.stdout.write('Phase 1: skipped.') + + # ── Phase 2: fill in content_hash / file_size_bytes for incomplete rows ── + # RFC §8: query WHERE content_hash = '' OR file_size_bytes IS NULL. + # --app/--model do not restrict Phase 2 (it operates on FileMetadata directly). + + if run_phase2: + self.stdout.write('Phase 2: filling in missing size/hash for existing rows...') + + incomplete_qs = FileMetadata.objects.filter( + Q(content_hash='') | Q(file_size_bytes__isnull=True) + ) + count = incomplete_qs.count() + + if count == 0: + self.stdout.write(' all rows already have size and hash.') + else: + self.stdout.write(f' {count} rows to update...') + + batch_start = time.time() + processed = 0 + for meta in incomplete_qs.iterator(chunk_size=batch_size): + full_path = os.path.join(settings.MEDIA_ROOT, meta.storage_name) + + try: + stat = os.stat(full_path) + file_size_bytes = stat.st_size + content_hash = ( + _compute_hash(full_path) + if not skip_hash and not meta.content_hash + else meta.content_hash + ) + except OSError as e: + self.stdout.write( + self.style.WARNING( + f' uuid={meta.uuid}: cannot read {full_path}: {e}')) + total_errors += 1 + continue + + if dry_run: + self.stdout.write( + f' [dry-run] uuid={meta.uuid} → size={file_size_bytes}') + total_updated += 1 + processed += 1 + continue + + update_fields = ['backfilled_at'] + if meta.file_size_bytes is None: + meta.file_size_bytes = file_size_bytes + update_fields.append('file_size_bytes') + if not meta.content_hash and not skip_hash: + meta.content_hash = content_hash + update_fields.append('content_hash') + meta.backfilled_at = timezone.now() + meta.save(update_fields=update_fields) + + total_updated += 1 + processed += 1 + + if rate_limit > 0: + elapsed = time.time() - batch_start + target_elapsed = processed / rate_limit + if target_elapsed > elapsed: + time.sleep(target_elapsed - elapsed) + + self.stdout.write( + self.style.SUCCESS(f' done: {processed} rows updated.')) + else: + self.stdout.write('Phase 2: skipped.') + + elapsed = time.time() - start_time self.stdout.write('') self.stdout.write( - f'Backfill complete — created: {total_created}, ' - f'errors (file missing): {total_errors}.') + f'Backfill complete — created: {total_created}, updated: {total_updated}, ' + f'errors (file missing): {total_errors}, elapsed: {elapsed:.1f}s' + ) if dry_run: self.stdout.write(self.style.WARNING('(dry run — nothing was written)')) diff --git a/sapl/base/management/commands/backfill_file_metadata_hashes.py b/sapl/base/management/commands/backfill_file_metadata_hashes.py new file mode 100644 index 000000000..65ce11488 --- /dev/null +++ b/sapl/base/management/commands/backfill_file_metadata_hashes.py @@ -0,0 +1,111 @@ +""" +Phase 2 backfill — fills file_size_bytes and content_hash for FileMetadata rows +where backfilled_at IS NULL. 
I/O-bound; designed to run at low priority over +days or weeks without affecting production traffic. + +Safe to interrupt and resume: sets backfilled_at on each completed row. + +Usage: + python manage.py backfill_file_metadata_hashes [--batch-size=200] [--rate-limit=20] + +Progress query: + SELECT + COUNT(*) FILTER (WHERE backfilled_at IS NULL) AS pending, + COUNT(*) FILTER (WHERE backfilled_at IS NOT NULL) AS done + FROM base_file_metadata; +""" +import hashlib +import os +import time + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.utils import timezone + + +def _compute_hash(path): + h = hashlib.sha256() + with open(path, 'rb') as fh: + for chunk in iter(lambda: fh.read(65536), b''): + h.update(chunk) + return h.hexdigest() + + +class Command(BaseCommand): + help = ( + 'Phase 2 backfill: fill file_size_bytes and content_hash for FileMetadata rows ' + 'where backfilled_at IS NULL. Resumable and rate-limited.' + ) + + def add_arguments(self, parser): + parser.add_argument( + '--batch-size', type=int, default=200, + help='Rows per iteration (default: 200).', + ) + parser.add_argument( + '--rate-limit', type=int, default=0, + help='Max rows per second (0 = unlimited).', + ) + parser.add_argument( + '--skip-hash', action='store_true', + help='Only populate file_size_bytes, skip SHA-256 (faster).', + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Report counts without writing.', + ) + + def handle(self, *args, **options): + from sapl.base.models import FileMetadata + + batch_size = options['batch_size'] + rate_limit = options['rate_limit'] + skip_hash = options['skip_hash'] + dry_run = options['dry_run'] + + if dry_run: + self.stdout.write(self.style.WARNING('DRY RUN — nothing will be written.')) + + qs = FileMetadata.objects.filter(backfilled_at__isnull=True) + total = qs.count() + self.stdout.write(f'Phase 2: {total} rows to process...') + + processed = 0 + errors = 0 + batch_start = time.time() + + for meta in qs.iterator(chunk_size=batch_size): + full_path = os.path.join(settings.MEDIA_ROOT, meta.storage_name) + try: + stat = os.stat(full_path) + file_size_bytes = stat.st_size + content_hash = _compute_hash(full_path) if not skip_hash else '' + except OSError as e: + self.stdout.write(self.style.WARNING( + f' uuid={meta.uuid}: cannot read {full_path}: {e}')) + errors += 1 + continue + + if not dry_run: + update_fields = ['backfilled_at'] + meta.backfilled_at = timezone.now() + if meta.file_size_bytes is None: + meta.file_size_bytes = file_size_bytes + update_fields.append('file_size_bytes') + if not meta.content_hash and not skip_hash: + meta.content_hash = content_hash + update_fields.append('content_hash') + meta.save(update_fields=update_fields) + + processed += 1 + + if rate_limit > 0: + elapsed = time.time() - batch_start + target = processed / rate_limit + if target > elapsed: + time.sleep(target - elapsed) + + self.stdout.write(self.style.SUCCESS( + f'Phase 2 complete — processed: {processed}, errors: {errors}')) + if dry_run: + self.stdout.write(self.style.WARNING('(dry run — nothing was written)')) diff --git a/sapl/base/management/commands/backfill_file_metadata_structural.py b/sapl/base/management/commands/backfill_file_metadata_structural.py new file mode 100644 index 000000000..512d598b7 --- /dev/null +++ b/sapl/base/management/commands/backfill_file_metadata_structural.py @@ -0,0 +1,153 @@ +""" +Phase 1 backfill — creates FileMetadata rows for parent-model instances whose +_metadata FK is NULL. 
Pure DB work, no disk I/O. Completes in seconds. + +Safe to re-run (idempotent). Safe to run from multiple workers simultaneously +(bulk_create with ignore_conflicts=True handles races). + +Usage: + python manage.py backfill_file_metadata_structural [--batch-size=1000] +""" +from pathlib import Path + +from django.apps import apps +from django.core.management.base import BaseCommand +from django.db import transaction + +METADATA_FILE_FIELDS = [ + ('materia', 'materialegislativa', 'texto_original'), + ('materia', 'documentoacessorio', 'arquivo'), + ('materia', 'proposicao', 'texto_original'), + ('protocoloadm', 'documentoadministrativo', 'texto_integral'), + ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'), + ('norma', 'normajuridica', 'texto_integral'), + ('norma', 'anexonormajuridica', 'anexo_arquivo'), + ('comissoes', 'reuniao', 'upload_pauta'), + ('comissoes', 'reuniao', 'upload_ata'), + ('comissoes', 'reuniao', 'upload_anexo'), + ('comissoes', 'documentoacessorio', 'arquivo'), + ('audiencia', 'audienciapublica', 'upload_pauta'), + ('audiencia', 'audienciapublica', 'upload_ata'), + ('audiencia', 'audienciapublica', 'upload_anexo'), + ('audiencia', 'anexoaudienciapublica', 'arquivo'), + ('sessao', 'sessaoplenaria', 'upload_pauta'), + ('sessao', 'sessaoplenaria', 'upload_ata'), + ('sessao', 'sessaoplenaria', 'upload_anexo'), + ('sessao', 'justificativaausencia', 'upload_anexo'), + ('sessao', 'orador', 'upload_anexo'), + ('sessao', 'oradorexpediente', 'upload_anexo'), + ('sessao', 'oradorordemdia', 'upload_anexo'), +] + + +class Command(BaseCommand): + help = ( + 'Phase 1 backfill: create FileMetadata rows for existing files (no disk I/O). ' + 'Idempotent — skips instances where _metadata FK is already set.' + ) + + def add_arguments(self, parser): + parser.add_argument( + '--batch-size', type=int, default=1000, + help='Rows per bulk_create batch (default: 1000).', + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Report counts without writing.', + ) + + def handle(self, *args, **options): + from sapl.base.models import FileMetadata + + batch_size = options['batch_size'] + dry_run = options['dry_run'] + + if dry_run: + self.stdout.write(self.style.WARNING('DRY RUN — nothing will be written.')) + + total_created = 0 + + for app_label, model_name, field_name in METADATA_FILE_FIELDS: + try: + Model = apps.get_model(app_label, model_name) + except LookupError: + self.stdout.write(self.style.ERROR( + f' {app_label}.{model_name} not found — skipping.')) + continue + + meta_fk = f'{field_name}_metadata' + qs = ( + Model.objects + .filter(**{f'{field_name}__isnull': False, f'{meta_fk}__isnull': True}) + .exclude(**{field_name: ''}) + .only('pk', field_name, meta_fk) + ) + count = qs.count() + if count == 0: + self.stdout.write(f' {app_label}.{model_name}.{field_name}: up to date.') + continue + + self.stdout.write(f' {app_label}.{model_name}.{field_name}: {count} rows...') + + if dry_run: + total_created += count + continue + + # Process in batches: bulk_create rows, then bulk_update the FK back. 
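+            # Flushing every batch_size rows keeps memory bounded even on tables
+            # with hundreds of thousands of file rows.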
+ batch = [] + instances = [] + for instance in qs.iterator(chunk_size=batch_size): + field_file = getattr(instance, field_name) + storage_name = field_file.name + batch.append(FileMetadata( + storage_name=storage_name, + original_filename=Path(storage_name).name, + app_label=app_label, + model_name=model_name, + field_name=field_name, + owner_pk=instance.pk, + )) + instances.append(instance) + + if len(batch) >= batch_size: + total_created += self._flush( + batch, instances, meta_fk, field_name, Model) + batch = [] + instances = [] + + if batch: + total_created += self._flush(batch, instances, meta_fk, field_name, Model) + + self.stdout.write(self.style.SUCCESS(f' done.')) + + self.stdout.write(f'Structural backfill complete — created: {total_created}') + if dry_run: + self.stdout.write(self.style.WARNING('(dry run — nothing was written)')) + + def _flush(self, batch, instances, meta_fk, field_name, Model): + from sapl.base.models import FileMetadata + + with transaction.atomic(): + created = FileMetadata.objects.bulk_create(batch, ignore_conflicts=True) + + # Re-query to get PKs for the rows we just inserted (bulk_create may not + # return PKs on all DB backends, and ignore_conflicts rows have pk=None). + storage_names = [m.storage_name for m in batch] + meta_map = { + m.storage_name: m.pk + for m in FileMetadata.objects.filter(storage_name__in=storage_names) + } + + update_instances = [] + for instance in instances: + field_file = getattr(instance, field_name) + pk = meta_map.get(field_file.name) + if pk: + setattr(instance, f'{meta_fk}_id', pk) + update_instances.append(instance) + + if update_instances: + with transaction.atomic(): + Model.objects.bulk_update(update_instances, [f'{meta_fk}_id']) + + return len(created) diff --git a/sapl/base/migrations/0062_filemetadata_owner_fields.py b/sapl/base/migrations/0062_filemetadata_owner_fields.py new file mode 100644 index 000000000..5abd9579a --- /dev/null +++ b/sapl/base/migrations/0062_filemetadata_owner_fields.py @@ -0,0 +1,43 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('base', '0061_file_metadata'), + ] + + operations = [ + migrations.AddField( + model_name='filemetadata', + name='app_label', + field=models.CharField(blank=True, default='', max_length=100), + ), + migrations.AddField( + model_name='filemetadata', + name='model_name', + field=models.CharField(blank=True, default='', max_length=100), + ), + migrations.AddField( + model_name='filemetadata', + name='field_name', + field=models.CharField(blank=True, default='', max_length=100), + ), + migrations.AddField( + model_name='filemetadata', + name='owner_pk', + field=models.PositiveIntegerField(blank=True, null=True), + ), + migrations.AlterField( + model_name='filemetadata', + name='storage_name', + field=models.CharField(editable=False, max_length=512, verbose_name='Storage name'), + ), + migrations.AddIndex( + model_name='filemetadata', + index=models.Index( + fields=['app_label', 'model_name', 'field_name'], + name='filemetadata_owner_context_idx', + ), + ), + ] diff --git a/sapl/base/models.py b/sapl/base/models.py index f6475c56b..cf6c62ab0 100644 --- a/sapl/base/models.py +++ b/sapl/base/models.py @@ -511,6 +511,7 @@ class FileMetadata(models.Model): ) storage_name = models.CharField( max_length=512, + editable=False, verbose_name=_('Storage name'), ) original_filename = models.CharField( @@ -537,11 +538,21 @@ class FileMetadata(models.Model): blank=True, verbose_name=_('Backfilled at'), ) + app_label = 
models.CharField(max_length=100, default='', blank=True) + model_name = models.CharField(max_length=100, default='', blank=True) + field_name = models.CharField(max_length=100, default='', blank=True) + owner_pk = models.PositiveIntegerField(null=True, blank=True) class Meta: db_table = 'base_file_metadata' verbose_name = _('File Metadata') verbose_name_plural = _('File Metadata') + indexes = [ + models.Index( + fields=['app_label', 'model_name', 'field_name'], + name='filemetadata_owner_context_idx', + ), + ] def __str__(self): return f'{self.original_filename} (v{self.version})' diff --git a/sapl/base/views.py b/sapl/base/views.py index d0f014b35..ee6867411 100644 --- a/sapl/base/views.py +++ b/sapl/base/views.py @@ -1575,11 +1575,26 @@ def pesquisa_textual(request): # File-serving views (RFC §6.4, §9) # --------------------------------------------------------------------------- +import logging as _logging # noqa: E402 import os as _os # noqa: E402 -from urllib.parse import quote # noqa: E402 — kept near usage site +from django.http import FileResponse, HttpResponse, HttpResponseForbidden # noqa: E402 -from django.http import FileResponse, HttpResponse # noqa: E402 +from sapl.base.fields import ( # noqa: E402 + _PRIVATE_FIELDS, _content_disposition, _is_public, +) + +_serve_logger = _logging.getLogger(__name__) + +try: + from prometheus_client import Counter as _Counter + _metadata_fallback_counter = _Counter( + 'sapl_metadata_fallback_total', + 'Requests served via legacy pre-metadata fallback', + ['app', 'model', 'field'], + ) +except Exception: + _metadata_fallback_counter = None SERVE_FILE_FIELDS = frozenset({ ('materia', 'materialegislativa', 'texto_original'), @@ -1604,31 +1619,63 @@ SERVE_FILE_FIELDS = frozenset({ }) -def serve_file(request, file_uuid): +def can_download_file(user, meta, owner_object=None) -> bool: + """Single authorization gate for all file downloads (RFC §6.4.1).""" + key = (meta.app_label, meta.model_name, meta.field_name) + + if key not in _PRIVATE_FIELDS: + return True + + # Lazy-load owner_object from DB when not provided. + if owner_object is None and meta.owner_pk is not None: + try: + model = apps.get_model(meta.app_label, meta.model_name) + owner_object = model.objects.filter(pk=meta.owner_pk).first() + except LookupError: + pass + + if key == ('materia', 'proposicao', 'texto_original'): + if user.is_staff: + return True + if owner_object and hasattr(owner_object, 'autor'): + return owner_object.autor.filter(user_profile__user=user).exists() + return False + + if meta.app_label == 'protocoloadm' and meta.model_name == 'documentoadministrativo': + if user.is_staff: + return True + return owner_object is not None and not owner_object.restrito + + if key == ('protocoloadm', 'documentoacessorioadministrativo', 'arquivo'): + if user.is_staff: + return True + parent = getattr(owner_object, 'documento', None) if owner_object else None + return parent is not None and not parent.restrito + + return False + + +def serve_file(request, file_uuid, owner_object=None): """ Secure file-serving view — the single chokepoint for all document downloads. - Resolves uuid → FileMetadata → storage_name, performs a permission check - (currently public files pass unconditionally), then delegates the actual - byte transfer to nginx via X-Accel-Redirect. Django never reads file bytes - into Python memory; nginx's sendfile delivers them zero-copy. - - The /media/ location in nginx must be marked 'internal' so that clients - cannot bypass this view and fetch files directly. 
+ Resolves uuid → FileMetadata → storage_name, enforces access control, then + delegates byte transfer to nginx via X-Accel-Redirect. Django never reads + file bytes into memory. """ from sapl.base.models import FileMetadata from django.shortcuts import get_object_or_404 as _get_or_404 meta = _get_or_404(FileMetadata, uuid=file_uuid) - # Permission check — currently unconditional for public files. - # When DocumentoAdministrativo.restrito / nivel_restricao is wired, - # insert the per-file check here (RFC §6.4). + if not can_download_file(request.user, meta, owner_object): + if not request.user.is_authenticated: + raise Http404 + return HttpResponseForbidden() display_name = meta.original_filename or Path(meta.storage_name).name if settings.DEBUG: - # runserver has no nginx: serve the bytes directly from the filesystem. file_path = _os.path.join(settings.MEDIA_ROOT, meta.storage_name) try: fh = open(file_path, 'rb') @@ -1636,30 +1683,16 @@ def serve_file(request, file_uuid): raise Http404 return FileResponse(fh, as_attachment=False, filename=display_name) - # Production: delegate byte transfer to nginx via X-Accel-Redirect. - # storage_name is relative to MEDIA_ROOT (e.g. "sapl/public/norma/…/file.pdf"). - internal_path = f'/media/{meta.storage_name}' - response = HttpResponse() - response['X-Accel-Redirect'] = internal_path - - # RFC 6266 — dual filename parameter: ASCII fallback + UTF-8 encoded. - filename_ascii = display_name.encode('ascii', 'replace').decode() - filename_encoded = quote(display_name, safe='') - response['Content-Disposition'] = ( - f'inline; filename="{filename_ascii}"' - f"; filename*=UTF-8''{filename_encoded}" - ) + response['X-Accel-Redirect'] = f'/media/{meta.storage_name}' + response['Content-Disposition'] = _content_disposition(display_name) + response['Cache-Control'] = 'public, max-age=300' if _is_public(meta) else 'no-store' return response def serve_model_file(request, app_label, model_name, pk, field_name): """ Semantic alias for file downloads: /////download - - Validates the (app_label, model_name, field_name) triple against an explicit - allowlist, fetches the parent model instance, resolves the _metadata FK, then - delegates to serve_file. All permission logic lives in serve_file only. """ from django.shortcuts import get_object_or_404 as _get_or_404 @@ -1673,10 +1706,32 @@ def serve_model_file(request, app_label, model_name, pk, field_name): instance = _get_or_404(model, pk=pk) meta = getattr(instance, f'{field_name}_metadata', None) - if meta is None: - raise Http404 - return serve_file(request, file_uuid=meta.uuid) + if meta is None: + # Temporary fallback while backfill is in progress. + field_file = getattr(instance, field_name) + if not field_file: + raise Http404 + _serve_logger.warning( + 'serve_model_file fallback: metadata NULL for %s/%s/%s pk=%s', + app_label, model_name, field_name, pk, + ) + if _metadata_fallback_counter is not None: + _metadata_fallback_counter.labels( + app=app_label, model=model_name, field=field_name, + ).inc() + if settings.DEBUG: + file_path = _os.path.join(settings.MEDIA_ROOT, field_file.name) + try: + fh = open(file_path, 'rb') + except OSError: + raise Http404 + return FileResponse(fh) + response = HttpResponse() + response['X-Accel-Redirect'] = f'/media/{field_file.name}' + return response + + return serve_file(request, file_uuid=meta.uuid, owner_object=instance) # Image fields served via X-Accel-Redirect — same nginx internal mechanism as