From 64c9b241fab85c8b31c524d11a53074794d45217 Mon Sep 17 00:00:00 2001 From: Edward Oliveira Date: Thu, 16 Apr 2026 17:50:44 -0300 Subject: [PATCH] Phase 5: X-Accel-Redirect for /media/, UA Redis deny list, per-path counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nginx: - /media/ proxied through Gunicorn (sapl_general rate limit) instead of direct alias — Django middleware now runs on every media request - /_accel/media/ internal location serves file bytes via X-Accel-Redirect sapl/base/media.py (new): - serve_media() gate: path traversal guard, auth redirect for documentos_privados/, per-path Redis counter, content-type metadata cache, X-Accel-Redirect response; falls back to Django serve() in DEBUG sapl/middleware/ratelimit.py: - RL_PATH_REQUESTS, RL_UA_BLOCKLIST, FILE_META_KEY constants - _incr_with_ttl() extracted to module level (reused by media.py) - Runtime UA deny list: _refresh_ua_blocklist() fetches rl:bot:ua:blocked SET from Redis (SMEMBERS, cached per worker, TTL=RATE_LIMITER_UA_BLOCKLIST_REFRESH); _is_redis_blocked_ua() tokenises UA and checks sha256 of each token sapl/settings.py: - RATE_LIMITER_UA_BLOCKLIST_REFRESH, MEDIA_PATH_COUNTER_TTL, MEDIA_FILE_CACHE_TTL added (all env-tunable via config()) plan/RATE_LIMITER_PLAN.md: - Key schema table updated; media file serving section added; decision flow documented; UA deny list seed section expanded Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 145 +++ docker/config/nginx/sapl.conf | 34 +- docker/scripts/redis_inject_test_data.py | 169 +++ plan/RATE_LIMITER_PLAN.md | 474 +++++++++ plan/rate-limiter-v2.md | 1231 ++++++++++++++++++++++ sapl/base/media.py | 96 ++ sapl/middleware/ratelimit.py | 106 +- sapl/settings.py | 126 ++- sapl/urls.py | 13 +- 9 files changed, 2303 insertions(+), 91 deletions(-) create mode 100644 CLAUDE.md create mode 100644 docker/scripts/redis_inject_test_data.py create mode 100644 plan/RATE_LIMITER_PLAN.md create mode 100644 
plan/rate-limiter-v2.md create mode 100644 sapl/base/media.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..b3a1102eb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,145 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +SAPL (Sistema de Apoio ao Processo Legislativo) is a Django-based legislative management system used by Brazilian municipal and state legislative houses. It manages bills, parliamentary sessions, committees, norms, protocols, and related legislative workflows. + +## Commands + +### Development + +```bash +# Run dev server +python manage.py runserver + +# Docker (dev, without bundled DB) +docker-compose -f docker/docker-compose-dev.yml up + +# Docker (dev, with PostgreSQL container) +docker-compose -f docker/docker-compose-dev-db.yml up +``` + +### Database Setup (local PostgreSQL) + +```bash +sudo -u postgres psql -c "CREATE ROLE sapl LOGIN ENCRYPTED PASSWORD 'sapl' NOSUPERUSER INHERIT CREATEDB NOCREATEROLE NOREPLICATION;" +sudo -u postgres psql -c "CREATE DATABASE sapl WITH OWNER=sapl ENCODING='UTF8' LC_COLLATE='pt_BR.UTF-8' LC_CTYPE='pt_BR.UTF-8' CONNECTION LIMIT=-1 TEMPLATE template0;" +python manage.py migrate +``` + +### Testing + +```bash +# All tests (reuses DB by default for speed) +pytest + +# Single test file or test function +pytest sapl/materia/tests/test_materia.py +pytest sapl/materia/tests/test_materia.py::test_function_name + +# Force DB recreation +pytest --create-db + +# With coverage +pytest --cov=sapl +``` + +Tests require `DJANGO_SETTINGS_MODULE=sapl.settings` (set in `pytest.ini`). All tests must be marked with `@pytest.mark.django_db`. The `conftest.py` root fixture provides an `app` fixture (WebTest `DjangoTestApp`). + +### Linting / Formatting + +```bash +flake8 . +isort . 
+autopep8 --in-place +``` + +### Restore Database from Backup + +```bash +./scripts/restore_db.sh -f /path/to/dump +./scripts/restore_db.sh -f /path/to/dump -p 5433 # Docker port +``` + +## Architecture + +### Django Apps + +Apps are under `sapl/` and follow domain boundaries: + +| App | Domain | +|-----|--------| +| `base` | `CasaLegislativa` (legislative house config), `AppConfig`, `Autor` (authorship) | +| `parliamentary` | `Parlamentar`, `Legislatura`, `SessaoLegislativa`, `Coligacao` | +| `materia` | Bills (`MateriaLegislativa`), types, tracking, annexes | +| `norma` | Laws/norms (`NormaJuridica`) and hierarchies | +| `sessao` | Plenary sessions, agenda, attendance, voting | +| `comissoes` | Committees (`Comissao`) and meetings (`Reuniao`) | +| `protocoloadm` | Administrative protocols and document intake | +| `compilacao` | Structured/articulated texts (LexML-like tree structure) | +| `lexml` | LexML XML standard integration | +| `audiencia` | Public hearings | +| `painel` | Real-time session display panel | +| `relatorios` | PDF report generation | +| `api` | REST API entry point (auto-generated ViewSets) | +| `crud` | Generic CRUD base views | +| `rules` | Business rules and permission definitions | + +### REST API + +The API uses a custom `drfautoapi` package (`drfautoapi/drfautoapi.py`) that auto-generates DRF ViewSets, Serializers, and FilterSets from Django models. Authentication is Token + Session. Permissions use a custom `SaplModelPermissions` class that maps HTTP methods to Django model permissions. + +OpenAPI 3.0 docs are generated by drf-spectacular. + +### Caching + +- **Default:** File-based (`/var/tmp/django_cache`) +- **Production:** Redis via django-redis; configured at startup by `configure_redis_cache()` in `sapl/settings.py` +- **Cache key prefix:** `cache:{POD_NAMESPACE}:` (namespace-isolated for multi-tenant k8s) +- **Rate limiter state** is shared via Redis keys + +### Feature Flags + +django-waffle is used for feature flags. 
Switches (global on/off) can be toggled via: + +```bash +python manage.py waffle_switch on|off +``` + +### Key Environment Variables + +| Variable | Purpose | +|----------|---------| +| `DATABASE_URL` | PostgreSQL connection string | +| `SECRET_KEY` | Django secret key | +| `DEBUG` | Debug mode | +| `REDIS_URL` | Redis host:port | +| `CACHE_BACKEND` | `file` or `redis` | +| `POD_NAMESPACE` | K8s namespace (used in cache key prefix) | +| `USE_SOLR` | Enable Haystack/Solr full-text search | +| `SOLR_URL` / `SOLR_COLLECTION` | Solr connection | + +### Docker Build + +The production build requires a MaxMind GeoLite2-ASN license key (for nginx ASN-based bot blocking): + +```bash +docker build --secret id=maxmind_key,src=.env -f docker/Dockerfile -t sapl:local . +``` + +Optional build args: `WITH_NGINX`, `WITH_GRAPHVIZ`, `WITH_POPPLER`, `WITH_PSQL_CLIENT`. + +### Key File Locations + +| File | Purpose | +|------|---------| +| `sapl/settings.py` | All Django settings, including cache/rate-limit setup | +| `pytest.ini` | Test configuration (DJANGO_SETTINGS_MODULE, addopts) | +| `conftest.py` | Root pytest fixtures | +| `drfautoapi/drfautoapi.py` | Auto-API generation logic | +| `docker/startup_scripts/start.sh` | Container entrypoint (migrations, waffle, gunicorn) | +| `requirements/requirements.txt` | Production deps | +| `requirements/test-requirements.txt` | Test deps | +| `requirements/dev-requirements.txt` | Dev/lint deps | diff --git a/docker/config/nginx/sapl.conf b/docker/config/nginx/sapl.conf index 16d26e7c7..9123181a8 100644 --- a/docker/config/nginx/sapl.conf +++ b/docker/config/nginx/sapl.conf @@ -45,21 +45,28 @@ server { } # ---------------------------------------------------------------- - # Media files — FIX: add ETags and Cache-Control headers. - # sendfile on + etag on converts repeat bot requests to 304s. + # Media files — routed through Django for auth, rate counting, + # and content-type caching; served from disk via X-Accel-Redirect. 
# ---------------------------------------------------------------- location /media/ { - alias /var/interlegis/sapl/media/; - sendfile on; - etag on; - add_header Cache-Control "public, max-age=86400, stale-while-revalidate=3600"; - add_header X-Robots-Tag "noindex" always; + limit_req zone=sapl_general burst=${NGINX_BURST_GENERAL} nodelay; + limit_req_status 429; + + proxy_set_header X-Request-ID $req_id; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Host $http_host; + proxy_redirect off; + proxy_pass http://sapl_server; } - # Private documents — X-Accel-Redirect after auth check in Django. - location /media/documentos_privados/ { + # Internal location used exclusively by X-Accel-Redirect responses + # from serve_media(). Not reachable by external clients. + location /_accel/media/ { internal; - alias /var/interlegis/sapl/media/documentos_privados/; + alias /var/interlegis/sapl/media/; + sendfile on; + etag on; } # ---------------------------------------------------------------- @@ -67,7 +74,7 @@ server { # Tighter rate limit; extended timeout for uncached generation. # ---------------------------------------------------------------- location /relatorios/ { - limit_req zone=sapl_heavy burst=5 nodelay; + limit_req zone=sapl_heavy burst=${NGINX_BURST_HEAVY} nodelay; limit_req_status 429; proxy_read_timeout 180s; @@ -102,7 +109,7 @@ server { # /api/ — rate limited, CORS maintained from original config. # ---------------------------------------------------------------- location /api/ { - limit_req zone=sapl_general burst=30 nodelay; + limit_req zone=sapl_general burst=${NGINX_BURST_API} nodelay; limit_req_status 429; add_header 'Access-Control-Allow-Origin' '*'; @@ -134,7 +141,7 @@ server { # General traffic — moderate rate limit. 
# ---------------------------------------------------------------- location / { - limit_req zone=sapl_general burst=20 nodelay; + limit_req zone=sapl_general burst=${NGINX_BURST_GENERAL} nodelay; limit_req_status 429; proxy_set_header X-Request-ID $req_id; @@ -147,6 +154,7 @@ server { error_page 429 /429.html; location = /429.html { + add_header Retry-After 60 always; root /var/interlegis/sapl/sapl/static/; internal; } diff --git a/docker/scripts/redis_inject_test_data.py b/docker/scripts/redis_inject_test_data.py new file mode 100644 index 000000000..94dc0a1fe --- /dev/null +++ b/docker/scripts/redis_inject_test_data.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +redis_inject_test_data.py — inject synthetic rate-limiter entries into Redis. + +Purpose: validate that RateLimitMiddleware reads the expected key schema, +that Redis CLI / RedisInsight shows the right structure, and that blocking +logic fires correctly without waiting for real traffic. + +Usage: + # Against docker-compose Redis (default) + python3 docker/scripts/redis_inject_test_data.py + + # Against a different host/port + REDIS_URL=redis://localhost:6379 python3 docker/scripts/redis_inject_test_data.py + + # Clear all synthetic keys written by a previous run + CLEAR=1 python3 docker/scripts/redis_inject_test_data.py + +Key schema (DB 1 — rate limiter): + rl:ip:{ip}:reqs INCR counter — anonymous request count (TTL 60s) + rl:ip:{ip}:blocked string "1" — IP hard-blocked (TTL 300s) + rl:{ns}:user:{uid}:reqs INCR counter — auth user request count (TTL 60s) + rl:{ns}:user:{uid}:blocked string "1" — user hard-blocked (TTL 300s) + rl:{ns}:ip:{ip}:w:{bucket} INCR — namespace/IP sliding window (TTL 120s) +""" + +import os +import sys +import time +from decouple import config + +# ── dependency check ────────────────────────────────────────────────────── +try: + import redis +except ImportError: + print("ERROR: redis-py not installed. 
Run: pip install redis", file=sys.stderr) + sys.exit(1) + +# ── config ──────────────────────────────────────────────────────────────── +REDIS_URL = config("REDIS_URL", default="redis://localhost:6379") +RATELIMIT_DB = 1 # DB1 is the rate-limiter database +CLEAR = config("CLEAR", default="0").lower() in ("1", "true", "yes") + +# Synthetic values — tweak to exercise different code paths +NAMESPACE = "sapl" # POD_NAMESPACE value (hostname or k8s namespace) +ANON_WINDOW = 60 # seconds — must match settings.RATE_LIMITER_RATE period +AUTH_WINDOW = 60 +BLOCK_TTL = 300 + +TEST_IPS = [ + "203.0.113.1", # below threshold (20 reqs) + "203.0.113.2", # AT threshold (35 reqs — should trigger block) + "203.0.113.3", # already blocked + "203.0.113.4", # namespace/window counter near threshold +] + +TEST_USERS = [ + {"uid": "42", "reqs": 50, "blocked": False}, # normal auth user + {"uid": "99", "reqs": 120, "blocked": False}, # AT auth threshold + {"uid": "7", "reqs": 10, "blocked": True}, # pre-blocked user +] + +# ── helpers ─────────────────────────────────────────────────────────────── + +def key_ip_reqs(ip): + return f"rl:ip:{ip}:reqs" + +def key_ip_blocked(ip): + return f"rl:ip:{ip}:blocked" + +def key_user_reqs(ns, uid): + return f"rl:{ns}:user:{uid}:reqs" + +def key_user_blocked(ns, uid): + return f"rl:{ns}:user:{uid}:blocked" + +def key_ns_window(ns, ip, bucket): + return f"rl:{ns}:ip:{ip}:w:{bucket}" + + +def write(r, key, value, ttl, label): + if isinstance(value, int): + pipe = r.pipeline() + pipe.set(key, value, ex=ttl) + pipe.execute() + else: + r.set(key, value, ex=ttl) + print(f" SET {key!r} = {value!r} EX {ttl}s ({label})") + + +def delete_pattern(r, pattern): + keys = r.keys(pattern) + if keys: + r.delete(*keys) + print(f" DEL {len(keys)} keys matching {pattern!r}") + else: + print(f" (no keys matching {pattern!r})") + + +# ── main ────────────────────────────────────────────────────────────────── + +def main(): + r = redis.from_url(REDIS_URL, db=RATELIMIT_DB, 
decode_responses=True) + try: + r.ping() + except redis.ConnectionError as exc: + print(f"ERROR: cannot connect to Redis at {REDIS_URL}: {exc}", file=sys.stderr) + sys.exit(1) + + print(f"Redis: {REDIS_URL} DB={RATELIMIT_DB} clear={CLEAR}") + print() + + # ── clear mode ──────────────────────────────────────────────────────── + if CLEAR: + print("=== Clearing synthetic test keys ===") + for ip in TEST_IPS: + delete_pattern(r, f"rl:ip:{ip}:*") + delete_pattern(r, f"rl:{NAMESPACE}:ip:{ip}:*") + for u in TEST_USERS: + delete_pattern(r, f"rl:{NAMESPACE}:user:{u['uid']}:*") + print("Done.") + return + + # ── anonymous IP counters ───────────────────────────────────────────── + print("=== Anonymous IP request counters (DB1) ===") + write(r, key_ip_reqs(TEST_IPS[0]), 20, ANON_WINDOW, "below threshold") + write(r, key_ip_reqs(TEST_IPS[1]), 35, ANON_WINDOW, "AT threshold → middleware will block on next req") + write(r, key_ip_reqs(TEST_IPS[3]), 30, ANON_WINDOW, "below threshold") + print() + + # ── blocked IPs ─────────────────────────────────────────────────────── + print("=== Blocked IPs (DB1) ===") + write(r, key_ip_blocked(TEST_IPS[2]), "1", BLOCK_TTL, "hard-blocked") + print() + + # ── namespace/IP sliding window ─────────────────────────────────────── + print("=== Namespace/IP sliding window (DB1) ===") + bucket = int(time.time() // ANON_WINDOW) + write(r, key_ns_window(NAMESPACE, TEST_IPS[3], bucket), 34, ANON_WINDOW * 2, + "near window threshold (next req triggers ua_rotation block)") + print() + + # ── authenticated user counters ─────────────────────────────────────── + print("=== Authenticated user request counters (DB1) ===") + for u in TEST_USERS: + if not u["blocked"]: + write(r, key_user_reqs(NAMESPACE, u["uid"]), u["reqs"], AUTH_WINDOW, + f"uid={u['uid']} reqs={u['reqs']}") + print() + + # ── blocked users ───────────────────────────────────────────────────── + print("=== Blocked users (DB1) ===") + for u in TEST_USERS: + if u["blocked"]: + write(r, 
key_user_blocked(NAMESPACE, u["uid"]), "1", BLOCK_TTL, + f"uid={u['uid']} hard-blocked") + print() + + # ── summary ─────────────────────────────────────────────────────────── + all_keys = r.keys("rl:*") + print(f"=== DB{RATELIMIT_DB} now contains {len(all_keys)} rl:* keys ===") + for k in sorted(all_keys): + ttl = r.ttl(k) + val = r.get(k) + print(f" {k!r:55s} val={val!r:5} ttl={ttl}s") + + +if __name__ == "__main__": + main() diff --git a/plan/RATE_LIMITER_PLAN.md b/plan/RATE_LIMITER_PLAN.md new file mode 100644 index 000000000..aa2bd93e9 --- /dev/null +++ b/plan/RATE_LIMITER_PLAN.md @@ -0,0 +1,474 @@ +# SAPL — Kubernetes Redis + +Manifests for the shared Redis instance used by all SAPL pods for +cross-pod rate limiting (DB 1) and view/static-file caching (DB 0). + +--- + +## Directory layout + +``` +docker/k8s/ +└── redis/ + ├── redis-configmap.yaml # redis.conf — no persistence, allkeys-lru, 5 GB ceiling + ├── redis-deployment.yaml # Deployment (1 replica, redis:7-alpine) + └── redis-service.yaml # ClusterIP service on port 6379 +``` + +--- + +## Prerequisites + +- `kubectl` configured to talk to the target cluster. +- A `sapl-redis` namespace (created below if it doesn't exist). + +--- + +## Deploy + +```bash +# 1. Create the namespace (idempotent) +rancher kubectl create namespace sapl-redis --dry-run=client -o yaml | rancher kubectl apply -f - + +# 2. Apply all three manifests +rancher kubectl apply -f docker/k8s/redis/redis-configmap.yaml +rancher kubectl apply -f docker/k8s/redis/redis-deployment.yaml +rancher kubectl apply -f docker/k8s/redis/redis-service.yaml + +# 3. Verify the pod is Running +rancher kubectl -n sapl-redis get pods -l app=sapl-redis +``` + +Expected output: +``` +NAME READY STATUS RESTARTS AGE +sapl-redis-6d9f8b7c4d-xk2lm 1/1 Running 0 30s +``` + +--- + +## Verify the rate limiter + +`scripts/test_ratelimiter.py` fires repeated GET requests at a SAPL URL and reports +when the first 429 is returned. 
+ +### Usage + +``` +python scripts/test_ratelimiter.py [-n NUM] [-d DELAY] [-t TIMEOUT] +``` + +| Flag | Default | Meaning | +|------|---------|---------| +| `url` | *(required)* | Full URL including scheme, e.g. `http://localhost` | +| `-n`, `--num-requests` | `50` | Maximum requests to send | +| `-d`, `--delay` | `0.1` | Seconds between requests | +| `-t`, `--timeout` | `10` | Per-request timeout in seconds | + +The script stops and prints a summary as soon as a 429 is received. + +### Examples + +```bash +# Hit the anonymous threshold (35 req/min) — fire 40 requests with minimal delay +python scripts/test_ratelimiter.py http://localhost -n 40 -d 0.05 + +# Slower fire — check that legitimate traffic is not rate-limited +python scripts/test_ratelimiter.py http://localhost -n 20 -d 2 + +# Test against a staging pod via port-forward +rancher kubectl port-forward -n deploy/sapl 8080:80 & +python scripts/test_ratelimiter.py http://localhost:8080 -n 40 -d 0.05 +``` + +### Reading the output + +``` +Request 1: Status 200 | Time: 0.045s +... +Request 36: Status 429 | Time: 0.038s + -> Rate limited on request 36 + +Summary: + Total requests attempted: 36 + Successful (200): 35 + Rate limited (429): 1 + First 429 occurred at request: 36 +``` + +A first-429 near the configured anonymous threshold (35 req/min) confirms the +middleware is wired correctly. A first-429 much earlier points to nginx `limit_req` +firing before Django sees the request. + +--- + +## Inject REDIS_URL into SAPL instances + +`REDIS_URL` points at the shared instance: + +``` +redis://redis.sapl-redis.svc.cluster.local:6379 + ^^^^^ ^^^^^^^^^^ + svc namespace +``` + +`start.sh` picks it up on every pod startup and sets the `REDIS_CACHE` waffle switch +automatically — no further intervention needed. + +### Fleet-wide rollout + +Uses the `app.kubernetes.io/name=sapl` pod label to discover every SAPL namespace +automatically — onboarding a new municipality requires no script changes. 
+ +```bash +for ns in $(rancher kubectl get pods -A -l app.kubernetes.io/name=sapl \ + -o jsonpath='{.items[*].metadata.namespace}' | tr ' ' '\n' | sort -u); do + rancher kubectl set env deployment/sapl \ + REDIS_URL=redis://redis.sapl-redis.svc.cluster.local:6379 \ + -n $ns +done +``` + +### Roll back + +```bash +for ns in $(rancher kubectl get pods -A -l app.kubernetes.io/name=sapl \ + -o jsonpath='{.items[*].metadata.namespace}' | tr ' ' '\n' | sort -u); do + rancher kubectl set env deployment/sapl REDIS_URL- -n $ns +done +``` + +`kubectl set env deployment/sapl REDIS_URL-` (trailing `-`) removes the variable. +`start.sh` then falls back to file-based cache automatically. + +--- + +## Monitor + +### Pod and events + +```bash +# Pod status +rancher kubectl -n sapl-redis get pods -l app=sapl-redis -o wide + +# Deployment events (useful right after apply) +rancher kubectl -n sapl-redis describe deployment sapl-redis + +# Pod events (OOMKill, restarts, etc.) +rancher kubectl -n sapl-redis describe pod -l app=sapl-redis +``` + +### Logs + +```bash +# Tail live logs +rancher kubectl -n sapl-redis logs -f deploy/sapl-redis + +# Last 100 lines +rancher kubectl -n sapl-redis logs deploy/sapl-redis --tail=100 +``` + +### Redis INFO + +```bash +# Memory usage +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ + redis-cli info memory \ + | grep -E 'used_memory_human|maxmemory_human|mem_fragmentation_ratio' + +# Connection pressure +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ + redis-cli info stats \ + | grep -E 'rejected_connections|instantaneous_ops_per_sec' + +# Key distribution per DB +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli info keyspace + +# Recent slow queries +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli slowlog get 10 + +# Live command sampling (1-second window) +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli --latency-history -i 1 +``` + +### Rate-limiter keys (DB 1) + +```bash 
+rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ + redis-cli -n 1 dbsize + +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ + redis-cli -n 1 --scan --pattern 'rl:ip:*' | head -20 +``` + +--- + +## Seed the UA deny list (once after first deploy) + +`rl:bot:ua:blocked` is a permanent Redis SET in DB 1. Each member is the +SHA-256 of a **UA token** — the identifying fragment extracted after splitting +on `/`, spaces, `;`, `(`, `)`, e.g.: + +``` +UA string: "GPTBot/1.1 (+https://openai.com/gptbot)" +Tokens: GPTBot 1.1 +https: ... +Hash stored: sha256("GPTBot") +``` + +The middleware (`_is_redis_blocked_ua`) tokenises the incoming UA the same +way and checks each token hash against the cached set. The SET is fetched +from Redis at most once per `RATE_LIMITER_UA_BLOCKLIST_REFRESH` seconds (default 60) +per worker process. + +The bots in `BOT_UA_FRAGMENTS` (Python list, always active) and this Redis +SET are **independent** — the Python list provides the baseline and the Redis +SET allows adding new offenders at runtime **without a code deploy**. + +```bash +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked \ + "$(echo -n 'GPTBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'ClaudeBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'PerplexityBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'Bytespider' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'AhrefsBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'meta-externalagent' | sha256sum | cut -d' ' -f1)" + +# Add a new offender at runtime (picked up within RATE_LIMITER_UA_BLOCKLIST_REFRESH seconds) +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked "$(echo -n 'NewBot' | sha256sum | cut -d' ' -f1)" +``` + +--- + +## Local standalone Redis (development / testing) + +No Kubernetes? 
Run Redis directly with Docker: + +```bash +sudo docker run --rm -p 6379:6379 redis:7-alpine \ + redis-server --save "" --appendonly no +``` + +Then point Django at it by exporting the env var before starting the dev server: + +```bash +export REDIS_URL="redis://localhost:6379" +export CACHE_BACKEND="redis" +python manage.py runserver +``` + +Or add them to your local `.env` file: + +``` +REDIS_URL=redis://localhost:6379 +CACHE_BACKEND=redis +``` + +> **Note**: the waffle switch `REDIS_CACHE` must also be `on` in your local +> database for `start.sh` to activate the Redis backend. Run: +> ```bash +> python manage.py waffle_switch REDIS_CACHE on --create +> ``` + +--- + +## Update `redis.conf` without redeploying + +```bash +# Edit the ConfigMap +rancher kubectl -n sapl-redis edit configmap redis-config + +# Restart the pod to pick up the new config +rancher kubectl -n sapl-redis rollout restart deployment/sapl-redis +``` + +--- + +## Rate limiting — two layers, two jobs + +SAPL enforces rate limits at two independent layers. They use different +algorithms and protect different things; their thresholds must be tuned +separately. + +### Layer 1 — nginx `limit_req` (leaky bucket) + +Defined in `docker/config/nginx/nginx.conf` (zones) and `sapl.conf` (burst). + +``` +sapl_general rate=30r/m # 1 token every 2 s +sapl_heavy rate=10r/m # 1 token every 6 s (PDF/report endpoints) +``` + +`burst=N nodelay` means nginx accepts up to N requests instantly above the +current token level, then enforces the drip rate. Requests beyond the burst +cap return 429 before reaching Gunicorn — **zero Python cost**. 
+ +Burst values are set at container startup via env vars: + +| Env var | Default | Location | +|---------|---------|----------| +| `NGINX_BURST_GENERAL` | `60` | `location /`, `location /media/` | +| `NGINX_BURST_API` | `60` | `location /api/` | +| `NGINX_BURST_HEAVY` | `20` | `location /relatorios/` | + +Defaults are 2× the zone's per-minute rate, so a user can spend a full +minute's quota in a single burst before the leaky bucket takes over. + +### Layer 2 — Django `RateLimitMiddleware` (sliding window) + +Defined in `sapl/middleware/ratelimit.py`, backed by Redis DB 1. + +Requests that pass nginx reach Python. The middleware counts them in a +60-second sliding window per IP (anonymous) or per user (authenticated): + +| Env var | Default | Scope | +|---------|---------|-------| +| `RATE_LIMITER_RATE` | `35/m` | Anonymous IP | +| `RATE_LIMITER_RATE_AUTHENTICATED` | `120/m` | Authenticated user | +| `RATE_LIMITER_RATE_BOT` | `5/m` | *(reserved — bots are currently blocked outright, not counted)* | +| `RATE_LIMITER_UA_BLOCKLIST_REFRESH` | `60` s | How often each worker re-fetches `rl:bot:ua:blocked` from Redis | + +When the window count hits the threshold the IP/user is written to a Redis +blocked-set with a 300 s TTL and subsequent requests return 429 with +`Retry-After: 300` — without touching the database. + +Decision flow inside `RateLimitMiddleware._evaluate()`: + +``` +1. IP in whitelist? → pass (no further checks) +1a. UA matches BOT_UA_FRAGMENTS list? → 429 reason=known_ua +1b. UA token hash in rl:bot:ua:blocked SET? → 429 reason=redis_ua +2. IP in rl:ip:{ip}:blocked? → 429 reason=ip_blocked +3. Authenticated user? + 3a. User in rl:{ns}:user:{uid}:blocked? → 429 reason=user_blocked + 3b. Suspicious headers (no Accept/AL)? → 429 reason=suspicious_headers_auth + 3c. User request count ≥ auth threshold? → SET blocked, 429 reason=auth_user_rate +4. Anonymous: + 4a. Suspicious headers? → 429 reason=suspicious_headers + 4b. IP request count ≥ anon threshold? 
→ SET blocked, 429 reason=ip_rate + 4c. NS/IP window count ≥ anon threshold? → SET blocked, 429 reason=ua_rotation + → pass +``` + +### Why they are not the same number + +| | nginx burst | Django threshold | +|-|------------|-----------------| +| **Algorithm** | Leaky bucket — token refills over time | Sliding window — hard count per 60 s | +| **Protects** | Gunicorn workers from being flooded | Per-client fairness, business policy | +| **Tuned by** | Capacity of the server | Acceptable request volume per client | +| **Failure mode** | Workers overwhelmed | Legitimate user over-browsing | + +A user loading a page quickly may fire 5–10 Django requests in two seconds. +With `rate=30r/m` (1 token/2 s) and `burst=60` they absorb that fine; the +leaky bucket refills before they click the next link. The Django threshold +(35/m sliding window) catches sustained automated traffic from a single IP +that looks like scraping even if it arrives slowly enough to beat the nginx +burst cap. + +--- + +## Request routing — how nginx reaches Django + +`proxy_pass http://sapl_server` forwards the HTTP request — with the original +path intact — to the Gunicorn Unix socket. Django doesn't know or care that +nginx is in front; it sees a standard HTTP request. + +``` +GET /media/foo.pdf + │ + ▼ + nginx (sapl.conf) + location /media/ → proxy_pass to Unix socket + │ + ▼ + Gunicorn (WSGI server) + receives raw HTTP, calls Django WSGI application + │ + ▼ + Django middleware stack (settings.MIDDLEWARE) + RateLimitMiddleware → pass or 429 + │ + ▼ + Django URL router (sapl/urls.py) + r'^media/(?P.*)$' → serve_media + │ + ▼ + serve_media(request, path='foo.pdf') + returns HttpResponse with X-Accel-Redirect: /_accel/media/foo.pdf + │ + ▼ + nginx sees X-Accel-Redirect header + /_accel/media/ internal location → reads file from disk → sends to client +``` + +nginx does no routing beyond picking a `location` block. The mapping from +URL path to Python function lives entirely in `sapl/urls.py`. 
`proxy_pass` is +just a pipe. + +--- + +## Media file serving — `serve_media` and X-Accel-Redirect + +All `/media/` requests (public and private) are routed through Gunicorn so that +Django middleware runs on every hit. Nginx serves the file bytes via +`X-Accel-Redirect` — the Gunicorn worker is freed as soon as it sends the +response headers. + +### nginx locations (`docker/config/nginx/sapl.conf`) + +```nginx +# Proxied to Gunicorn — Django middleware + serve_media() run here. +location /media/ { + limit_req zone=sapl_general burst=${NGINX_BURST_GENERAL} nodelay; + proxy_pass http://sapl_server; +} + +# Internal — only reachable via X-Accel-Redirect, not by external clients. +location /_accel/media/ { + internal; + alias /var/interlegis/sapl/media/; + sendfile on; + etag on; +} +``` + +### Django view (`sapl/base/media.py`) + +`serve_media(request, path)` — registered at `^media/(?P.*)$` in `sapl/urls.py`. + +Per-request steps: + +1. **Path traversal guard** — `os.path.abspath` check; raises 404 on escape. +2. **Auth gate** — `documentos_privados/` paths require an authenticated session; redirects to login otherwise. +3. **Path counter** — increments `rl:{ns}:path:{sha256}:reqs` in Redis DB 1 (TTL = `MEDIA_PATH_COUNTER_TTL`). +4. **Content-type cache** — reads `file:{ns}:{sha256}` from Django default cache (DB 0); on miss, calls `mimetypes.guess_type`, stores result (TTL = `MEDIA_FILE_CACHE_TTL`). +5. **Serve** — in DEBUG: `django.views.static.serve` directly. In production: `X-Accel-Redirect: /_accel/media/`. 
+ +### Settings + +| Setting | Default | Purpose | +|---------|---------|---------| +| `FILE_META_KEY` | `'file:{ns}:{sha256}'` | Key template for content-type cache (DB 0) | +| `MEDIA_PATH_COUNTER_TTL` | `60` s | Per-path counter window | +| `MEDIA_FILE_CACHE_TTL` | `3600` s | Content-type metadata TTL | + +--- + +## Key schema reference + +| DB | Use case | Key pattern | TTL | Constant | +|----|----------|-------------|-----|----------| +| 0 | Page / view cache | `cache:{ns}:*` | 300 s (default) | `CACHES['default']` KEY_PREFIX | +| 0 | Static file cache (logos) | `static:{ns}:{sha256}` | 3 – 24 h | *Future* (requires OpenResty/Lua) | +| 0 | Media file content-type cache | `file:{ns}:{sha256}` | 1 h | `FILE_META_KEY` | +| 1 | IP rate-limit counter | `rl:ip:{ip}:reqs` | 60 s | `RL_IP_REQUESTS` | +| 1 | IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | `RL_IP_BLOCKED` | +| 1 | User rate-limit counter | `rl:{ns}:user:{uid}:reqs` | 60 s | `RL_USER_REQUESTS` | +| 1 | User blocked marker | `rl:{ns}:user:{uid}:blocked` | 300 s | `RL_USER_BLOCKED` | +| 1 | Namespace/IP sliding window | `rl:{ns}:ip:{ip}:w:{bucket}` | 120 s | `RL_NS_WINDOW` | +| 1 | Path counter (`/media/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | `RL_PATH_REQUESTS` | +| 1 | Path counter (`/static/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | *Future* (requires OpenResty/Lua) | +| 1 | UA deny list | `rl:bot:ua:blocked` | permanent SET | `RL_UA_BLOCKLIST` | +| 2 | Django Channels | `channels:*` | session TTL | *Future* | diff --git a/plan/rate-limiter-v2.md b/plan/rate-limiter-v2.md new file mode 100644 index 000000000..ecbfcc3fd --- /dev/null +++ b/plan/rate-limiter-v2.md @@ -0,0 +1,1231 @@ +# SAPL — OOM Investigation & Remediation Plan (v2) + +> **Scope**: Django 2.2 / Gunicorn / nginx / Kubernetes fleet of 1,200+ pods. +> Each pod has a dedicated PostgreSQL instance. A K8s Ingress sits in front of all tenants. +> **This document is canonical** — all earlier session notes are consolidated here. 
+ +--- + +## Table of Contents + +1. [Architecture Overview](#0-architecture-overview) +2. [Context & Problem Statement](#1-context--problem-statement) +3. [Decision Log](#2-decision-log) +4. [Phase 0 — Immediate Hardening (No New Infra)](#3-phase-0--immediate-hardening-no-new-infra) +5. [Phase 1 — Shared Redis (Single Pod)](#4-phase-1--shared-redis-single-pod) +6. [Phase 2 — Rate Limiting & Bot Mitigation](#5-phase-2--rate-limiting--bot-mitigation) +7. [Phase 3 — File Serving Corrections](#6-phase-3--file-serving-corrections) +8. [Phase 4 — Dynamic Page Caching](#7-phase-4--dynamic-page-caching) +9. [Open Questions](#8-open-questions) + +--- + +## 0. Architecture Overview + +### 0.1 Component Diagram + +```mermaid +graph TD + Client([Bot / Human Client]) + nginx[nginx\nDebian pkg] + gunicorn[Gunicorn\n2 workers / 4 threads] + mw[Django Middleware\nRateLimitMiddleware] + view[View Layer\nCBV + decorators] + redis[(Redis\nDB0: cache\nDB1: rate limiter)] + pg[(PostgreSQL\nper-pod)] + fs[Filesystem\nPDFs / media] + + Client -->|HTTP| nginx + nginx -->|proxy_pass| gunicorn + gunicorn --> mw + mw -->|pass| view + mw -->|429| nginx + view --> pg + view --> fs + view --> redis + mw --> redis + nginx -->|SISMEMBER / GET| redis +``` + +> DB2 is reserved for Django Channels (WebSocket — future Phase 5). + +### 0.2 Redis Memory Budget and Key Layout + +| Key type | Key schema | TTL | DB | Est. 
size | +|---|---|---|---|---| +| Page / view cache | `cache:{ns}:` | 60–600 s | 0 | ~0.5 GB | +| Static cache (images/logos) | `cache:{ns}:static:{sha256}` | 3–24 h | 0 | ~2.4 GB | +| PDF cache (≤ 360 KB) | `cache:{ns}:file:{sha256}` | 1 h | 0 | ~0.9 GB | +| IP request counter | `rl:ip:{ip}:reqs` | 60 s | 1 | ~0.6 MB | +| IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | 1 | ~0.06 MB | +| User request counter | `rl:{ns}:user:{id}:reqs` | 60 s | 1 | negligible | +| User blocked marker | `rl:{ns}:user:{id}:blocked` | 300 s | 1 | negligible | +| Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | 1 | ~0.3 MB | +| UA deny list | `rl:bot:ua:blocked` | permanent SET | 1 | ~0.03 MB | +| NS/IP/window counter | `rl:ns:{ns}:ip:{ip}:w:{bucket}` | 60 s × 2 | 1 | ~0.6 MB | +| **Redis overhead (× 1.5)** | | | | ~1.6 GB | +| **Total ceiling** | | | | **~5 GB** | + +**Key conventions:** +- `{ns}` = Kubernetes namespace (tenant identifier). All path and user keys include it. +- `{user}` / `{id}` = normalized user PK: `str(user.pk).lower().strip()`. +- Django `CACHES` uses `KEY_PREFIX: "cache:{ns}"` (e.g. `cache:sapl:`) to namespace all DB0 cache keys. + DB1 (rate limiter) uses raw `rl:*` keys — no prefix — for compatibility with the Lua / middleware INCR scripts. +- DB2 is reserved for Django Channels; allocate separately when WebSocket work resumes. + +--- + +## 1. 
Context & Problem Statement + +### Fleet + +| Item | Detail | +|---|---| +| System | SAPL — Django 2.2, legislative management for Brazilian municipal chambers | +| Fleet | ~1,200 Kubernetes pods, each with a dedicated PostgreSQL pod | +| Pod limits | 1 core CPU (limit) / 35m (request) · 1600Mi RAM (limit) / 800Mi (request) | +| Users | Legislative house staff, often behind NAT (many users, one public IP) | +| Workloads | PDF generation (synchronous, ReportLab), file uploads up to 150 MB, WebSocket voting panel | + +### OOM Kill Pattern + +Workers grow from ~35 MB at birth to 800–900 MB within 2–3 minutes, then are killed and replaced in a continuous cycle. + +Root causes: +- Bot scraping triggers synchronous PDF generation — entire document built in RAM (ReportLab) +- `worker_max_memory_per_child` only checks **between requests**; workers blocked on long requests are never recycled +- `TIMEOUT=300` lets bots hold threads for up to 5 minutes while memory accumulates +- 3 workers × 300 MB each = ~900 MB — breaching the 800Mi request threshold + +### Bot Traffic Profile (Barueri pod, 16 days, 662k requests) + +| Actor | Requests | % of total | +|---|---|---| +| Googlebot | ~154,000 | 23.2% | +| Chrome/98.0.4758 (spoofed scraper) | 90,774 | 13.7% | +| kube-probe (healthcheck) | 69,065 | 10.4% | +| meta-externalagent | 28,325 | 4.3% | +| GPTBot | 11,489 | 1.7% | +| bingbot | 7,639 | 1.1% | +| OAI-SearchBot + Applebot | 6,681 | 1.0% | +| **Total identified bots** | **~377,000** | **~56.9%** | + +**Botnet fingerprint:** +- Rotates User-Agents (Chrome/121, Chrome/122, Firefox/123, Safari/17…) across requests +- Crawls all sub-endpoints of the same matéria within 1 second from different IPs +- Distributes crawling across tenants — each pod stays under the per-pod rate limit, never triggering it +- Primary targets: `/relatorios/{id}/etiqueta-materia-legislativa` (~40 KB PDF) and all `/materia/{id}/*` sub-endpoints + +### Static File Traffic (from CSV analysis) + +| Category 
| Requests | Transfers | +|---|---|---| +| Logos / images | 62,776 | ~24 GB | +| PDFs | 8,869 | 5.1 GB | +| Parliamentarian photos | 11,856 | ~0.5 GB | +| **Total** | **83,501** | **~30 GB** | + +Top offender: `Brasão - Foz do Iguaçu.png` — 14,512 requests, 5.6 GB from a single 392 KB file. + +### Confirmed Bugs + +```nginx +# nginx.conf — WRONG (disables kernel bypass) +sendfile off; + +# sapl.conf — missing on /media/ location +location /media/ { + alias /var/interlegis/sapl/media/; + # no ETag, no Cache-Control, no X-Robots-Tag +} +``` + +```python +# settings.py — per-pod cache, not shared +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/var/tmp/django_cache', + 'OPTIONS': {"MAX_ENTRIES": 10000}, + } +} +``` + +Django rate limiter (`django-ratelimit` at 35/m) uses `FileBasedCache` — counters are isolated per pod, making rate limiting completely ineffective at fleet scale. + +### Hard Constraints + +| Constraint | Impact | +|---|---| +| Per-pod PostgreSQL | Rate-limit counters not shared across pods | +| No Redis initially | No shared state for rate limiting or caching | +| NAT environments | IP-based rate limiting causes false positives | +| `TIMEOUT=300` / uploads to 150 MB | Must not be broken — intentional for slow workflows | + +--- + +## 2. 
Decision Log + +| Decision | Chosen | Rationale | Session | +|---|---|---|---| +| Redis topology | **Single pod** (no Sentinel, no Cluster) | 65 MB of active data fits comfortably on one node; cluster complexity not justified at this data volume | v2 | +| PDF caching in Redis | **No** — ETags + sendfile are sufficient | Once rate limiting + ETags are active, repeat requests become 304s with zero bytes transferred | Session 4 | +| nginx rate-limit end state | **Django middleware** with shared Redis | No nginx image changes required; solves cross-pod consistency immediately | Session 5 | +| `worker_max_memory_per_child` | **400 MB** | Pod limit 1600Mi, 2 workers × 400 MB = 800 MB — leaves 800 Mi headroom; previous 300 MB was OOMKilled before recycling could act | v2 | +| `sendfile off` | **Bug** — flip to `on` | No valid production reason found in uploaded config; disabling userspace copy is always correct | Session 5 | +| nginx serves `/media/` directly | Confirmed via `alias` in `sapl.conf` | `X-Accel-Redirect` only needed for LGPD-restricted documents | Session 5 | +| Cache backend switch timing | **At pod startup** via `start.sh` + waffle switch | Pod restart is acceptable; avoids per-request runtime overhead | Session 5 | +| Secret injection | Per-namespace Secret with `optional: true` | Enables gradual rollout; pod starts on file cache if Secret is absent | Session 5 | +| Redis k8s files location | `$PROJECT_ROOT/docker/k8s/` | Consistent with existing Docker artifacts in the repo | v2 | + +--- + +## 3. Phase 0 — Immediate Hardening (No New Infra) + +**Goal**: Stop the OOM kill cycle and reduce bot load with zero infrastructure additions. +**Risk**: Low — all changes are config-only. + +### 3.1 Gunicorn Tuning + +The core tension: reducing workers protects memory but reduces concurrency. The fix is to reduce the **number** of workers (from 3 to 2) and raise the per-worker **ceiling** so the recycling mechanism has time to act. 
+ +```python +# docker/startup_scripts/gunicorn.conf.py +import os +import pathlib + +NAME = "SAPL" +DJANGODIR = "/var/interlegis/sapl" +SOCKFILE = f"unix:{DJANGODIR}/run/gunicorn.sock" +USER = "sapl" +GROUP = "nginx" + +NUM_WORKERS = int(os.getenv("WEB_CONCURRENCY", "2")) # was 3 +THREADS = int(os.getenv("GUNICORN_THREADS", "4")) # was 8 +TIMEOUT = int(os.getenv("GUNICORN_TIMEOUT", "120")) # was 300 +WORKER_CLASS = "gthread" +DJANGO_SETTINGS = "sapl.settings" +WSGI_APP = "sapl.wsgi:application" + +proc_name = NAME +bind = SOCKFILE +umask = 0o007 +user = USER +group = GROUP +chdir = DJANGODIR +wsgi_app = WSGI_APP + +loglevel = "info" # was debug — reduces log I/O +accesslog = "/var/log/sapl/access.log" +errorlog = "/var/log/sapl/error.log" +capture_output = True + +workers = NUM_WORKERS +worker_class = WORKER_CLASS +threads = THREADS +timeout = TIMEOUT +graceful_timeout = 30 +keepalive = 10 +backlog = 2048 + +max_requests = 1000 +max_requests_jitter = 200 +worker_max_memory_per_child = 400 * 1024 * 1024 # 400 MB — was 300 MB + +raw_env = [f"DJANGO_SETTINGS_MODULE={DJANGO_SETTINGS}"] +preload_app = False + +def on_starting(server): + pathlib.Path(SOCKFILE).parent.mkdir(parents=True, exist_ok=True) + +def post_fork(server, worker): + try: + from django import db + db.connections.close_all() + except Exception: + pass +``` + +**Per-location timeout strategy** — replace the one-size-fits-all 300s: + +| Operation | Previous | Recommended | Rationale | +|---|---|---|---| +| Normal page rendering | 300 s | 60 s | No legitimate page should take > 60 s | +| API endpoints | 300 s | 30 s | Stateless, fast by design | +| PDF download (cached / nginx) | 300 s | 30 s | nginx serves from disk, worker not involved | +| PDF generation (uncached) | 300 s | 180 s | Kept high — addressed in Phase 5 | +| Large file upload | 300 s | 180 s | nginx buffers upload, worker processes after | + +--- + +### 3.2 nginx Fixes + +Three confirmed bugs in the uploaded config — all fixed here. 
+ +```nginx +# /etc/nginx/nginx.conf — http {} block + +# FIX 1: kernel bypass (was off — CRITICAL) +sendfile on; +tcp_nopush on; +tcp_nodelay on; + +# FIX 2: reduced timeouts (was 300s everywhere) +keepalive_timeout 75; +proxy_read_timeout 120s; # overridden per-location for slow ops +proxy_connect_timeout 10s; +proxy_send_timeout 120s; + +# Real client IP from X-Forwarded-For set by K8s Ingress +real_ip_header X-Forwarded-For; +real_ip_recursive on; +set_real_ip_from 10.0.0.0/8; +set_real_ip_from 172.16.0.0/12; +set_real_ip_from 192.168.0.0/16; +``` + +```nginx +# sapl.conf — FIX 3: add caching headers to /media/ +location /media/ { + alias /var/interlegis/sapl/media/; + sendfile on; + etag on; + add_header Cache-Control "public, max-age=86400, stale-while-revalidate=3600"; + add_header X-Robots-Tag "noindex" always; +} +``` + +**Upload endpoints** — keep `proxy_request_buffering on` so nginx absorbs slow uploads before handing off to Gunicorn: + +```nginx +location ~* ^/(protocoloadm/criar-protocolo|materia/.*upload|norma/.*upload) { + proxy_request_buffering on; + proxy_read_timeout 180s; + proxy_send_timeout 180s; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Host $http_host; + proxy_redirect off; + proxy_pass http://sapl_server; +} +``` + +--- + +### 3.3 Bot UA Blocklist in nginx + +Blocks known bots at nginx — before any Gunicorn worker is allocated. 
+ +```nginx +# nginx.conf — http {} block +map $http_user_agent $bot_ua_blocked { + default 0; + "~*GPTBot" 1; + "~*ClaudeBot" 1; + "~*PerplexityBot" 1; + "~*Bytespider" 1; + "~*AhrefsBot" 1; + "~*SemrushBot" 1; + "~*DotBot" 1; + "~*meta-externalagent" 1; + "~*OAI-SearchBot" 1; + "~*Chrome/98\.0\.4758" 1; # confirmed scraper — no real user runs a 2022 browser in 2026 +} + +# sapl.conf — server {} block (before any location) +if ($bot_ua_blocked = 1) { + return 429 "Too Many Requests"; +} +``` + +**Limitation**: Bots with rotating or spoofed UAs are not caught here. They are handled by Django middleware in Phase 2 (checks 3–5). This is intentional — nginx handles the cheap deterministic case; Django handles the expensive probabilistic case. + +--- + +### 3.4 ASN-Based Blocking (Mandatory) + +Blocks bot traffic by datacenter ASN — before UA parsing, before any Python process is touched. + +**Step 1 — install the GeoIP2 module and database:** + +```bash +# Debian / Ubuntu +apt install libnginx-mod-http-geoip2 libmaxminddb0 mmdb-bin + +# Download GeoLite2-ASN (free MaxMind account required) +mkdir -p /etc/nginx/geoip +curl -sL "https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-ASN&license_key=YOUR_KEY&suffix=tar.gz" \ + | tar -xz --strip-components=1 --wildcards '*.mmdb' -C /etc/nginx/geoip/ +``` + +**Step 2 — configure nginx:** + +```nginx +# nginx.conf — top-level (outside http {}) +load_module modules/ngx_http_geoip2_module.so; + +# nginx.conf — http {} block +geoip2 /etc/nginx/geoip/GeoLite2-ASN.mmdb { + $geoip2_asn_number autonomous_system_number; + $geoip2_asn_org autonomous_system_organization; +} + +map $geoip2_asn_number $bot_asn { + default 0; + 16509 1; # Amazon AWS + 14618 1; # Amazon AWS us-east + 8075 1; # Microsoft Azure + 396982 1; # Google Cloud + 20473 1; # Vultr + 24940 1; # Hetzner + 16276 1; # OVH + 36352 1; # ColoCrossing + 63949 1; # Linode / Akamai +} + +# sapl.conf — server {} block (before bot_ua_blocked check) +if ($bot_asn 
= 1) { + return 429 "Too Many Requests"; +} +``` + +**Step 3 — keep the database fresh** (host cron — no k8s CronJob): + +```bash +# /etc/cron.weekly/update-geoip +#!/bin/bash +curl -sL "https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-ASN&license_key=${MAXMIND_KEY}&suffix=tar.gz" \ + | tar -xz -C /tmp --wildcards '*.mmdb' +mv /tmp/GeoLite2-ASN_*/GeoLite2-ASN.mmdb /etc/nginx/geoip/GeoLite2-ASN.mmdb +nginx -s reload +``` + +**Tradeoff**: Blocks datacenter ASNs where bots originate. May over-block VPN users and developers on cloud instances — mitigate with a per-namespace IP whitelist once available (see Open Question 2). + +--- + +### 3.5 robots.txt + +Passive mitigation — effective over days/weeks for compliant bots. The spoofed Chrome/98 botnet ignores it; handled by nginx UA blocking above. + +``` +# Place at /var/interlegis/sapl/collected_static/robots.txt +User-agent: GPTBot +Disallow: / +Crawl-delay: 10 + +User-agent: ClaudeBot +Disallow: / +Crawl-delay: 10 + +User-agent: meta-externalagent +Disallow: / +Crawl-delay: 10 + +User-agent: OAI-SearchBot +Disallow: / +Crawl-delay: 10 + +User-agent: * +Disallow: /relatorios/ +Crawl-delay: 10 +``` + +Serve directly from nginx (no Django involvement): + +```nginx +# sapl.conf +location = /robots.txt { + alias /var/interlegis/sapl/collected_static/robots.txt; +} +``` + +--- + +### 3.6 N+1 Fix in `get_etiqueta_protocolos` + +Confirmed in `sapl/protocoloadm/utils.py` — `MateriaLegislativa.objects.filter()` called inside a loop over protocols. 
Two queries total regardless of volume: + +```python +# BEFORE — one query per protocol (N+1) +def get_etiqueta_protocolos(prots): + protocolos = [] + for p in prots: + dic = {} + for materia in MateriaLegislativa.objects.filter( + numero_protocolo=p.numero, ano=p.ano): + dic['num_materia'] = ( + materia.tipo.sigla + ' ' + + str(materia.numero) + '/' + str(materia.ano) + ) + protocolos.append(dic) + return protocolos + + +# AFTER — two queries total regardless of volume +def get_etiqueta_protocolos(prots): + from django.db.models import Q + import functools, operator + + prot_list = list(prots) + if not prot_list: + return [] + + query = functools.reduce( + operator.or_, + [Q(numero_protocolo=p.numero, ano=p.ano) for p in prot_list] + ) + materias_map = { + (m.numero_protocolo, m.ano): m + for m in MateriaLegislativa.objects.filter(query).select_related('tipo') + } + + protocolos = [] + for p in prot_list: + dic = {} + materia = materias_map.get((p.numero, p.ano)) + dic['num_materia'] = ( + f"{materia.tipo.sigla} {materia.numero}/{materia.ano}" + if materia else '' + ) + # ... rest of existing loop body unchanged + protocolos.append(dic) + return protocolos +``` + +--- + +### 3.7 ETags / 304 Responses + +Adding `etag on` and `Cache-Control` to the `/media/` location (§3.2) converts repeat bot requests from full downloads to 304 responses with empty bodies. + +For `Brasão - Foz do Iguaçu.png` (392 KB × 14,512 requests = **5.6 GB**), even a 50% conditional hit rate saves ~2.8 GB immediately — without any Redis. + +**Why this is sufficient for PDFs**: See Phase 3 §6.2. + +--- + +### 3.8 Django Upload Settings + +```python +# sapl/settings.py +# Files above 2 MB are streamed to a temp file on disk rather than +# held in worker RAM. Critical for 150 MB upload support. 
+FILE_UPLOAD_MAX_MEMORY_SIZE = 2 * 1024 * 1024 # 2 MB +DATA_UPLOAD_MAX_MEMORY_SIZE = 10 * 1024 * 1024 # 10 MB +MAX_DOC_UPLOAD_SIZE = 150 * 1024 * 1024 # 150 MB +FILE_UPLOAD_TEMP_DIR = '/var/interlegis/sapl/tmp' +``` + +--- + +## 4. Phase 1 — Shared Redis (Single Pod) + +**Goal**: Deploy Redis so all subsequent phases have shared state. +**Risk**: Medium — new stateful infrastructure. Non-fatal fallback to file cache if Redis is unreachable. + +### 4.1 Redis Kubernetes Manifests + +Files live under `$PROJECT_ROOT/docker/k8s/`. + +```yaml +# docker/k8s/redis-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: redis-config + namespace: redis +data: + redis.conf: | + save "" + appendonly no + + maxmemory 5gb + maxmemory-policy allkeys-lru + maxmemory-samples 10 + + maxclients 20000 + tcp-backlog 511 + timeout 300 + tcp-keepalive 60 + + hz 20 + lazyfree-lazy-eviction yes + lazyfree-lazy-expire yes + lazyfree-lazy-server-del yes + + slowlog-log-slower-than 10000 + slowlog-max-len 256 + latency-monitor-threshold 10 + + bind 0.0.0.0 + protected-mode no + databases 4 # DB0: cache, DB1: rate limiter, DB2: channels (future) + + activedefrag yes + active-defrag-ignore-bytes 100mb + active-defrag-threshold-lower 10 +``` + +```yaml +# docker/k8s/redis-pod.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sapl-redis + namespace: redis +spec: + replicas: 1 + selector: + matchLabels: + app: sapl-redis + template: + metadata: + labels: + app: sapl-redis + spec: + containers: + - name: redis + image: redis:7-alpine + command: ["redis-server", "/etc/redis/redis.conf"] + resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "6Gi" + cpu: "1000m" + ports: + - containerPort: 6379 + volumeMounts: + - name: redis-config + mountPath: /etc/redis + volumes: + - name: redis-config + configMap: + name: redis-config +``` + +```yaml +# docker/k8s/redis-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: sapl-redis + namespace: redis +spec: 
+ selector: + app: sapl-redis + ports: + - port: 6379 + targetPort: 6379 +``` + +**Pod budget rationale:** + +| Data type | Estimated memory | +|---|---| +| Rate limit counters (all pods, all IPs) | ~50–110 MB | +| View / template cache | ~300–600 MB | +| Small file cache (logos, etiquetas) | ~500 MB–1 GB | +| Redis overhead (× 1.5) | ~1.6 GB | +| **Total ceiling** | **~5 GB** | + +--- + +### 4.2 Use-Case / Key-Prefix Mapping + +| Use case | Key prefix | DB | TTL | Notes | +|---|---|---|---|---| +| Page / view cache | `cache:{ns}:*` | 0 | 60–600 s | `KEY_PREFIX=cache:{ns}` in Django CACHES | +| Static file cache (logos) | `cache:{ns}:static:{sha256}` | 0 | 3–24 h | ns = POD_NAMESPACE | +| PDF cache (≤ 360 KB) | `cache:{ns}:file:{sha256}` | 0 | 1 h | ns required | +| Rate limiter counters | `rl:*` | 1 | 60–300 s | Raw keys, no prefix | +| UA deny list | `rl:bot:ua:blocked` | 1 | permanent SET | Seed once after deploy | +| WebSocket / Channels | `channels:*` | 2 | session TTL | **Future — Phase 5** | + +--- + +### 4.3 Django Settings — Startup-Time Backend Selection + +```python +# sapl/settings.py +REDIS_URL = config('REDIS_URL', default='') +CACHE_BACKEND = config('CACHE_BACKEND', default='file') + +_redis_ready = CACHE_BACKEND == 'redis' and bool(REDIS_URL) + +CACHES = { + 'default': { + 'BACKEND': ( + 'django_redis.cache.RedisCache' if _redis_ready + else 'django.core.cache.backends.filebased.FileBasedCache' + ), + 'LOCATION': REDIS_URL + '/0' if _redis_ready else '/var/tmp/django_cache', + 'KEY_PREFIX': f'cache:{POD_NAMESPACE}', # e.g. 
"cache:sapl:" or "cache:patobranco-pr:" + **( + { + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': { + # 1,200 pods × 2 workers × 6 = 14,400 peak connections + # maxclients=20,000 gives 40% headroom + 'max_connections': 6, + 'socket_timeout': 0.5, + 'socket_connect_timeout': 0.5, + }, + 'IGNORE_EXCEPTIONS': True, # cache miss on Redis failure — app degrades gracefully + }, + 'TIMEOUT': 300, + } if _redis_ready else { + 'OPTIONS': {'MAX_ENTRIES': 10000}, + } + ), + }, + 'ratelimit': { + 'BACKEND': 'django_redis.cache.RedisCache', + 'LOCATION': REDIS_URL + '/1' if _redis_ready else '', + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': { + 'max_connections': 6, + 'socket_timeout': 0.5, + 'socket_connect_timeout': 0.5, + }, + 'IGNORE_EXCEPTIONS': True, + }, + } if _redis_ready else { + 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/var/tmp/django_ratelimit_cache', + 'OPTIONS': {'MAX_ENTRIES': 5000}, + }, +} + +RATELIMIT_USE_CACHE = 'ratelimit' +``` + +`start.sh` additions — resolve URL and read waffle switch before Gunicorn starts: + +```bash +resolve_redis_url() { + # 1. Already set by local Secret via envFrom — highest precedence + [[ -n "${REDIS_URL:-}" ]] && { log "REDIS_URL from local secret."; return 0; } + + # 2. 
Try global cluster Secret via k8s API + local api="https://kubernetes.default.svc" + local token ca + token="$(<'/var/run/secrets/kubernetes.io/serviceaccount/token')" + ca="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + local url + url=$(curl -sf --cacert "$ca" \ + -H "Authorization: Bearer $token" \ + "${api}/api/v1/namespaces/interlegis-infra/secrets/sapl-global-redis" \ + | python3 -c " +import sys, json, base64 +d = json.load(sys.stdin).get('data', {}) +v = d.get('REDIS_URL', '') +print(base64.b64decode(v).decode() if v else '') +" 2>/dev/null || echo "") + + if [[ -n "$url" ]]; then + export REDIS_URL="$url" + log "REDIS_URL from global cluster secret." + return 0 + fi + log "No REDIS_URL found — file-based cache will be used." +} + +resolve_cache_backend() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "REDIS_URL set — checking REDIS_CACHE waffle switch..." + local active + active=$(psql "$DATABASE_URL" -At -v ON_ERROR_STOP=0 -c \ + "SELECT active FROM waffle_switch WHERE name='REDIS_CACHE' LIMIT 1;" \ + 2>/dev/null || echo "") + if [[ "$active" == "t" ]]; then + log "REDIS_CACHE switch ON — activating Redis cache backend." + export CACHE_BACKEND="redis" + else + log "REDIS_CACHE switch OFF — using file-based cache." + export CACHE_BACKEND="file" + fi +} + +wait_for_redis() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "Checking Redis connectivity..." + local host port + host=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.hostname or 'localhost')") + port=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.port or 6379)") + local retries=10 + until python3 -c "import socket; s=socket.create_connection(('${host}',${port}),2); s.close()" 2>/dev/null; do + retries=$((retries-1)) + [[ $retries -eq 0 ]] && { log "WARNING: Redis unreachable — continuing on file cache."; return 0; } + log "Waiting for Redis... 
($retries retries left)" + sleep 2 + done + log "Redis reachable at ${host}:${port}." +} + +configure_redis_cache() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "Creating REDIS_CACHE waffle switch (default: off)" + python3 manage.py waffle_switch REDIS_CACHE off --create +} +``` + +--- + +### 4.4 Rollout Sequence + +```bash +# Enable Redis for one namespace +kubectl create secret generic sapl-redis \ + --namespace=fortaleza-ce \ + --from-literal=REDIS_URL="redis://sapl-redis.redis.svc.cluster.local:6379" \ + --dry-run=client -o yaml | kubectl apply -f - + +kubectl exec -n fortaleza-ce deploy/sapl -- \ + python manage.py waffle_switch REDIS_CACHE on --create + +kubectl rollout restart deployment/sapl -n fortaleza-ce + +# Disable without removing secret +kubectl exec -n fortaleza-ce deploy/sapl -- \ + python manage.py waffle_switch REDIS_CACHE off +kubectl rollout restart deployment/sapl -n fortaleza-ce + +# Fleet-wide rollout (parallel) +kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \ + xargs -P 10 -I{} kubectl exec -n {} deploy/sapl -- \ + python manage.py waffle_switch REDIS_CACHE on --create + +kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \ + xargs -P 5 -I{} kubectl rollout restart deployment/sapl -n {} +``` + +**Seed the UA deny list once after Redis is deployed:** + +```bash +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked \ + "$(echo -n 'GPTBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'ClaudeBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'PerplexityBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'Bytespider' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'AhrefsBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'meta-externalagent' | sha256sum | cut -d' ' -f1)" + +# Add new offenders at runtime without restart +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked "$(echo -n 'NewBot/1.0' | sha256sum | cut -d' ' -f1)" +``` + +**Production 
monitoring commands:** + +```bash +# Memory usage +kubectl exec -n redis deploy/sapl-redis -- redis-cli info memory \ + | grep -E 'used_memory_human|maxmemory_human|mem_fragmentation_ratio' + +# Connection pressure +kubectl exec -n redis deploy/sapl-redis -- redis-cli info stats \ + | grep -E 'rejected_connections|instantaneous_ops_per_sec' + +# Key distribution per DB +kubectl exec -n redis deploy/sapl-redis -- redis-cli info keyspace + +# Slow log +kubectl exec -n redis deploy/sapl-redis -- redis-cli slowlog get 25 +``` + +--- + +### 4.5 Inspecting Redis State + +#### CLI quick-reference (redis-cli or `kubectl exec`) + +```bash +# ── Connection ───────────────────────────────────────────────────────────── +# docker-compose +redis-cli -h localhost -p 6379 + +# k8s pod +kubectl exec -n deploy/sapl-redis -- redis-cli + +# ── DB selection (always specify -n for rate-limiter work) ───────────────── +# DB0 = page cache DB1 = rate limiter DB2 = channels (future) +redis-cli -n 1 # select DB1 + +# ── Key inspection ───────────────────────────────────────────────────────── +# List all rate-limiter keys +redis-cli -n 1 KEYS "rl:*" + +# Request counter for a specific IP +redis-cli -n 1 GET "rl:ip:203.0.113.1:reqs" + +# Remaining TTL on a counter +redis-cli -n 1 TTL "rl:ip:203.0.113.1:reqs" + +# Check if an IP is hard-blocked +redis-cli -n 1 EXISTS "rl:ip:203.0.113.1:blocked" + +# Authenticated user counter (ns = POD_NAMESPACE, uid = user pk) +redis-cli -n 1 GET "rl:sapl:user:42:reqs" + +# Namespace/IP sliding window (bucket = epoch // 60) +redis-cli -n 1 KEYS "rl:sapl:ip:203.0.113.1:w:*" + +# ── Manual block / unblock ───────────────────────────────────────────────── +# Block an IP for 5 minutes +redis-cli -n 1 SET "rl:ip:1.2.3.4:blocked" 1 EX 300 + +# Immediately unblock an IP +redis-cli -n 1 DEL "rl:ip:1.2.3.4:blocked" + +# Unblock a user +redis-cli -n 1 DEL "rl:sapl:user:42:blocked" + +# ── Aggregate stats ──────────────────────────────────────────────────────── +# Count 
all blocked IPs right now +redis-cli -n 1 KEYS "rl:ip:*:blocked" | wc -l + +# Count all blocked users +redis-cli -n 1 KEYS "rl:*:user:*:blocked" | wc -l + +# Total DB1 key count +redis-cli -n 1 DBSIZE + +# Memory used by DB1 +redis-cli INFO keyspace | grep "db1" + +# ── Cache DB inspection (DB0) ─────────────────────────────────────────────── +# Count cached page responses (KEY_PREFIX = cache:{ns}, e.g. "cache:sapl:") +redis-cli -n 0 KEYS "cache:sapl:*" | wc -l + +# Memory used by DB0 +redis-cli INFO keyspace | grep "db0" +``` + +#### RedisInsight + +Open `http://localhost:5540` (or whatever port you mapped) and connect to: +- **Host**: `localhost` (or the k8s service name) +- **Port**: `6379` +- **Database**: switch between DB0 (cache) and DB1 (rate limiter) using the database selector + +Filter keys by prefix `rl:ip:` to see all anonymous IP counters, `rl:*:user:` for authenticated users. + +#### Populate synthetic test data + +```bash +# Inject test entries to validate key schema and blocking thresholds +python3 docker/scripts/redis_inject_test_data.py + +# Point at a non-default Redis +REDIS_URL=redis://sapl-redis.redis.svc:6379 python3 docker/scripts/redis_inject_test_data.py + +# Clear all synthetic entries written by the script +CLEAR=1 python3 docker/scripts/redis_inject_test_data.py +``` + +--- + +## 5. Phase 2 — Rate Limiting & Bot Mitigation + +**Goal**: Effective cross-pod throttling using shared Redis. +**Prerequisite**: Phase 1 (Redis deployed and `CACHE_BACKEND=redis`). 
+ +### 5.1 Middleware Architecture + +```mermaid +flowchart TD + A([Request arrives at nginx]) --> B{SISMEMBER\nrl:bot:ua:blocked} + B -->|hit| Z1[429 — zero Django cost] + B -->|miss| C{GET\nrl:ip:blocked} + C -->|exists| Z2[429 — zero Django cost] + C -->|nil| D[proxy_pass to Gunicorn] + D --> E{authenticated?} + E -->|yes| F{INCR\nrl:{ns}:user:{id}:reqs\n>= 120/min?} + E -->|no| G{suspicious\nheaders?} + F -->|yes| Z3[SET user:blocked\n429] + F -->|no| H[call view] + G -->|yes| Z4[429] + G -->|no| I{INCR\nrl:ip:reqs\n>= 30/min?} + I -->|yes| Z5[SET ip:blocked\n429] + I -->|no| J{INCR\nrl:ns:ip:window\n>= 30/min?} + J -->|yes| Z6[SET ip:blocked\n429] + J -->|no| H + H --> K[Filesystem / ORM / Response] +``` + +### 5.2 RateLimitMiddleware Implementation + +```python +# sapl/middleware/ratelimit.py +import hashlib +import logging +import time + +from django.conf import settings +from django.core.cache import caches +from django.http import HttpResponse + +logger = logging.getLogger('sapl.ratelimit') + +BOT_UA_FRAGMENTS = [ + 'GPTBot', 'ClaudeBot', 'PerplexityBot', + 'Bytespider', 'AhrefsBot', 'meta-externalagent', + 'Chrome/98.0.4758', +] + + +def _sha256(s: str) -> str: + return hashlib.sha256(s.encode()).hexdigest() + + +def _is_suspicious_headers(request) -> bool: + # Real browsers send all three; bots frequently omit them + missing = sum([ + not request.META.get('HTTP_ACCEPT_LANGUAGE'), + not request.META.get('HTTP_ACCEPT'), + not request.META.get('HTTP_REFERER'), + ]) + return missing >= 2 + + +def _get_ip(request) -> str: + return ( + request.META.get('HTTP_X_FORWARDED_FOR', '').split(',')[0].strip() + or request.META.get('REMOTE_ADDR', '') + ) + + +class RateLimitMiddleware: + ANON_IP_THRESHOLD = 30 # req/min + AUTH_USER_THRESHOLD = 120 # req/min + BLOCK_TTL = 300 # seconds + + def __init__(self, get_response): + self.get_response = get_response + self._rl_cache = caches['ratelimit'] + + def __call__(self, request): + decision = self._evaluate(request) + if 
decision['action'] == 'block': + logger.warning('ratelimit_block', extra={ + 'ip': decision['ip'], + 'reason': decision['reason'], + 'ua': request.META.get('HTTP_USER_AGENT', ''), + 'path': request.path, + 'namespace': getattr(request, 'tenant', 'unknown'), + }) + return HttpResponse(status=429) + return self.get_response(request) + + def _evaluate(self, request): + ip = _get_ip(request) + + # Check 1: known UA (all requests) + ua = request.META.get('HTTP_USER_AGENT', '') + for fragment in BOT_UA_FRAGMENTS: + if fragment.lower() in ua.lower(): + return {'action': 'block', 'reason': 'known_ua', 'ip': ip} + + # Check 2: IP blocked marker + if self._rl_cache.get(f'rl:ip:{ip}:blocked'): + if not getattr(request, 'user', None) or not request.user.is_authenticated: + return {'action': 'block', 'reason': 'ip_blocked', 'ip': ip} + + if getattr(request, 'user', None) and request.user.is_authenticated: + return self._evaluate_authenticated(request, ip) + return self._evaluate_anonymous(request, ip) + + def _evaluate_authenticated(self, request, ip): + user_id = str(request.user.pk).lower().strip() + ns = getattr(request, 'tenant', 'global') + + if self._rl_cache.get(f'rl:{ns}:user:{user_id}:blocked'): + return {'action': 'block', 'reason': 'user_blocked', 'ip': ip} + + if _is_suspicious_headers(request): + return {'action': 'block', 'reason': 'suspicious_headers_auth', 'ip': ip} + + count = self._incr_with_ttl(f'rl:{ns}:user:{user_id}:reqs', ttl=60) + if count >= self.AUTH_USER_THRESHOLD: + self._rl_cache.set(f'rl:{ns}:user:{user_id}:blocked', 1, + timeout=self.BLOCK_TTL) + return {'action': 'block', 'reason': 'auth_user_rate', 'ip': ip} + + return {'action': 'pass', 'ip': ip} + + def _evaluate_anonymous(self, request, ip): + # Check 3: suspicious headers + if _is_suspicious_headers(request): + return {'action': 'block', 'reason': 'suspicious_headers', 'ip': ip} + + # Check 4: IP request rate + count = self._incr_with_ttl(f'rl:ip:{ip}:reqs', ttl=60) + if count >= 
self.ANON_IP_THRESHOLD: + self._rl_cache.set(f'rl:ip:{ip}:blocked', 1, timeout=self.BLOCK_TTL) + return {'action': 'block', 'reason': 'ip_rate', 'ip': ip} + + # Check 5: per-ns/ip/window (catches UA rotators) + ns = getattr(request, 'tenant', 'global') + bucket = int(time.time() // 60) + count = self._incr_with_ttl(f'rl:ns:{ns}:ip:{ip}:w:{bucket}', ttl=120) + if count >= self.ANON_IP_THRESHOLD: + self._rl_cache.set(f'rl:ip:{ip}:blocked', 1, timeout=self.BLOCK_TTL) + return {'action': 'block', 'reason': 'ua_rotation', 'ip': ip} + + return {'action': 'pass', 'ip': ip} + + def _incr_with_ttl(self, key: str, ttl: int) -> int: + """Atomic INCR + EXPIRE — TTL only set on key creation.""" + lua = """ + local n = redis.call('INCR', KEYS[1]) + if n == 1 then redis.call('EXPIRE', KEYS[1], ARGV[1]) end + return n + """ + client = self._rl_cache._cache.get_client() + return client.eval(lua, 1, key, ttl) +``` + +--- + +### 5.3 Settings Reference + +```python +# sapl/settings.py +MIDDLEWARE = [ + 'sapl.middleware.ratelimit.RateLimitMiddleware', # before session/auth + 'django.contrib.sessions.middleware.SessionMiddleware', + # ... 
rest unchanged +] + +RATE_LIMITER_RATE = config('RATE_LIMITER_RATE', default='35/m') +RATE_LIMITER_RATE_AUTHENTICATED = config('RATE_LIMITER_RATE_AUTHENTICATED', default='120/m') +RATE_LIMITER_RATE_BOT = config('RATE_LIMITER_RATE_BOT', default='5/m') + +# Optional / future — see Open Question 2 +RATE_LIMIT_WHITELIST_IPS = config( + 'RATE_LIMIT_WHITELIST_IPS', + default='', + cast=lambda v: [x.strip() for x in v.split(',') if x.strip()] +) +``` + +--- + +### 5.4 Enforcement Graduation Order + +Roll out to canary pods first; promote check-by-check in order of false-positive risk: + +| Order | Check | Risk | Condition to promote | +|---|---|---|---| +| 1st | `known_ua` | Zero | UA strings are deterministic | +| 2nd | `ip_blocked` | Zero | Key only set by prior proven-bad requests | +| 3rd | `ip_rate` | Low | Threshold calibrated from canary logs | +| 4th | `suspicious_headers` | Medium | Confirmed no legitimate clients omit all 3 headers | +| 5th | `ua_rotation` (ns/window) | Medium | NAT IP whitelist in place (see Open Question 2) | + +--- + +### 5.5 Decorator Migration + +For views where `django-ratelimit` decorators already exist: + +| Endpoint type | Action | Reason | +|---|---|---| +| List views (GET) | Remove after Phase 2 stable | Middleware covers equivalent threshold | +| Detail views (GET) | Remove after Phase 2 stable | Middleware covers equivalent threshold | +| Search / filter views | Remove last | Expensive queries — keep stricter per-view limit | +| PDF / file generation | **Keep permanently** | Most expensive; per-view limit tighter than global | +| Write endpoints (POST/PUT/DELETE) | **Keep permanently** | Different abuse surface | +| Auth endpoints (login, reset) | **Keep permanently** | Credential stuffing; must be independent | + +--- + +## 6. Phase 3 — File Serving Corrections + +**Goal**: Ensure nginx serves files correctly with kernel bypass and caching headers. +**Risk**: Low — config changes only. 
+ +### 6.1 Confirmed Architecture + +nginx already serves `/media/` directly via `alias` — **Django is not involved in file serving for public media**. `X-Accel-Redirect` is only needed for LGPD-restricted documents that must pass through Django for access control. + +The corrected `nginx.conf` and `sapl.conf` are shown in Phase 0 §3.2. No additional changes needed here. + +### 6.2 Why Redis is NOT Needed for PDFs + +With the full mitigation stack active: +- **ASN blocking** (Phase 0) drops datacenter bot traffic at nginx +- **UA blocking** (Phase 0) drops known-UA bots at nginx +- **Shared Redis rate counters** (Phase 2) enforce limits across all pods +- **ETags** (Phase 0 §3.2) convert repeat requests to 304 with zero bytes transferred +- **`sendfile on`** (Phase 0 §3.2) means disk reads bypass userspace entirely + +Redis PDF caching would solve "high request volume reaching the file layer" — but that problem no longer exists once the above stack is active. Redis memory is better reserved for rate counters, page cache, and sessions. + +### 6.3 File Serving Decision Matrix + +| File type | Size | Strategy | +|---|---|---| +| Logos / images | Any | nginx `alias` + `sendfile` + ETag + `Cache-Control` | +| Small PDFs | ≤ 360 KB | nginx direct + ETag | +| Medium PDFs | 360 KB – 2 MB | nginx direct + ETag + rate limit | +| Large PDFs | > 2 MB | nginx + strict rate limit; never Redis | +| LGPD-restricted | Any | Django view → `X-Accel-Redirect` → nginx (access control enforced) | + +--- + +## 7. Phase 4 — Dynamic Page Caching + +**Goal**: Eliminate ORM queries for anonymous bot requests on list views. +**Prerequisite**: Phase 1 (shared Redis, `CACHE_BACKEND=redis`). + +### 7.1 The Key Insight + +Many SAPL list views (`pesquisar-materia`, `norma`, etc.) are not truly dynamic for anonymous users between edits. A bot hammering `?page=1` through `?page=100` triggers 100 ORM queries per pod. 
With Redis page cache, each unique URL is queried once per TTL across the entire fleet. + +```python +# views.py — apply to anonymous list views only +from django.views.decorators.cache import cache_page +from django.utils.decorators import method_decorator + +@method_decorator(cache_page(60 * 5), name='dispatch') # 5-minute TTL +class PesquisarMateriaView(FilterView): + ... +``` + +> **Critical safety check**: `cache_page` sets `Cache-Control: private` for authenticated sessions automatically. Verify this is working before deploying — accidentally caching a session-aware response is a data leak. + +### 7.2 Cache TTL Guidelines + +| View type | TTL | Reasoning | +|---|---|---| +| Matéria list (anonymous) | 300 s | Changes infrequently between sessions | +| Norma list (anonymous) | 300 s | Same | +| Parlamentar list | 3600 s | Changes rarely | +| Search results | 60 s | Query-dependent, shorter TTL safer | +| Authenticated views | Never | `cache_page` respects this automatically | +| PDF generation | Never | Too large — serve from disk via nginx | + +--- + +## 8. Open Questions + +| # | Question | Status | Blocks | +|---|---|---|---| +| 1 | Does Chrome/98.0.4758 impersonator appear consistently in nginx access logs? | Needs investigation | Phase 0 UA block safety | +| 2 | Which legislative house IPs can be pre-whitelisted in `RATE_LIMIT_WHITELIST_IPS`? | We don't have this list yet — plan to obtain in the future. Setting is **optional / future**. | Phase 2 enforcement safety | +| 3 | Dockerfile scope | Single image for all tenants (confirmed). All path-based Redis keys include `{ns}`. | — | +| 4 | WebSocket voting panel priority | Separate project. Resumes after Redis is on k8s, bot siege addressed, and OOM pressure reduced. | Phase 5 sequencing | +| 5 | `CONN_MAX_AGE` tuning | Currently **300 s** (`sapl/settings.py:272`). Evaluate whether to reduce given worker recycling at 400 MB. 
| Phase 0 tuning | +| 6 | k8s Redis manifests | Development artifacts go under `$PROJECT_ROOT/docker/k8s/` (redis-pod.yaml, redis-service.yaml, redis-configmap.yaml). | Phase 1 delivery | + +--- + +*Document consolidated from multi-session architecture review — Edward / Interlegis SAPL infrastructure.* diff --git a/sapl/base/media.py b/sapl/base/media.py new file mode 100644 index 000000000..a7b4e0480 --- /dev/null +++ b/sapl/base/media.py @@ -0,0 +1,96 @@ +""" +serve_media — X-Accel-Redirect gate for all /media/ files. + +Production flow (nginx proxies /media/ to Gunicorn): + 1. Django middleware runs (IP rate-limit, bot UA check, etc.). + 2. serve_media() runs auth check for documentos_privados/, writes + per-path counter to Redis DB 1, caches content-type in Redis DB 0. + 3. Returns an empty 200 with X-Accel-Redirect pointing to the nginx + internal location /_accel/media/. Nginx serves the bytes + directly from disk — Gunicorn worker is freed immediately. + +Development flow (DEBUG=True, nginx absent): + Falls back to django.views.static.serve for live file serving. + +Redis side-effects per request: + DB 1 rl:{ns}:path:{sha256}:reqs — per-path access counter, TTL=MEDIA_PATH_COUNTER_TTL + DB 0 file:{ns}:{sha256} — content-type metadata, TTL=MEDIA_FILE_CACHE_TTL + (sha256 is of the URL path, e.g. sha256('/media/2024/01/doc.pdf')) + Key template: FILE_META_KEY (sapl/middleware/ratelimit.py); TTLs in sapl/settings.py +""" + +import hashlib +import mimetypes +import os + +from django.conf import settings +from django.core.cache import caches +from django.http import Http404, HttpResponse +from django.views.static import serve + +from sapl import settings as sapl_settings +from sapl.middleware.ratelimit import ( + _NAMESPACE, + FILE_META_KEY, + RL_PATH_REQUESTS, + _incr_with_ttl, +) + + +def _safe_resolve(rel_path): + """ + Return the absolute path of rel_path inside MEDIA_ROOT. + Raises Http404 if the resolved path would escape the root + (path traversal guard). 
+ """ + abs_root = os.path.abspath(settings.MEDIA_ROOT) + abs_path = os.path.abspath(os.path.join(abs_root, rel_path)) + if not abs_path.startswith(abs_root + os.sep) and abs_path != abs_root: + raise Http404 + return abs_path + + +def serve_media(request, path): + """ + Registered in sapl/urls.py for both DEBUG and production. + Route: ^media/(?P.*)$ + """ + # Path traversal guard — raises Http404 on escape attempt. + abs_path = _safe_resolve(path) + + # Auth gate for private documents — redirect to login if anonymous. + if path.startswith('documentos_privados/'): + user = getattr(request, 'user', None) + if user is None or not user.is_authenticated: + from django.contrib.auth.views import redirect_to_login + return redirect_to_login(request.get_full_path()) + + # Per-path rate counter (DB 1) — key uses URL path so that storage + # location changes in the next PR don't reset existing counters. + path_hash = hashlib.sha256(f'/media/{path}'.encode()).hexdigest() + _incr_with_ttl( + RL_PATH_REQUESTS.format(ns=_NAMESPACE, sha256=path_hash), + ttl=sapl_settings.MEDIA_PATH_COUNTER_TTL, + ) + + # Content-type metadata cache (DB 0) — avoids mimetypes.guess_type + # and os.path.isfile on every hit for hot files. + file_key = FILE_META_KEY.format(ns=_NAMESPACE, sha256=path_hash) + content_type = caches['default'].get(file_key) + if content_type is None: + if not os.path.isfile(abs_path): + raise Http404 + guessed, _ = mimetypes.guess_type(abs_path) + content_type = guessed or 'application/octet-stream' + caches['default'].set(file_key, content_type, timeout=sapl_settings.MEDIA_FILE_CACHE_TTL) + + if settings.DEBUG: + # Development: no nginx present; serve the file directly. + return serve(request, path, document_root=settings.MEDIA_ROOT) + + # Production: tell nginx to serve the file from the internal location. 
+ response = HttpResponse(content_type=content_type) + response['X-Accel-Redirect'] = f'/_accel/media/{path}' + response['Cache-Control'] = 'public, max-age=86400, stale-while-revalidate=3600' + response['X-Robots-Tag'] = 'noindex' + return response diff --git a/sapl/middleware/ratelimit.py b/sapl/middleware/ratelimit.py index 38e29110c..c0f3e850a 100644 --- a/sapl/middleware/ratelimit.py +++ b/sapl/middleware/ratelimit.py @@ -2,7 +2,8 @@ RateLimitMiddleware — cross-pod rate limiting backed by shared Redis. Decision flow (per request): - 1. Known bot UA? → 429 + 1. Known bot UA? → 429 (Python list — substring match) + 1b. Redis UA deny list? → 429 (runtime SET — token hash match, refreshed every 60 s) 2. IP in blocked set? → 429 3. Authenticated user? a. User blocked? → 429 @@ -13,7 +14,6 @@ Decision flow (per request): b. IP rate ≥ 35/min? → SET RL_IP_BLOCKED, 429 c. NS/IP window hit? → SET RL_IP_BLOCKED, 429 -All decisions are no-ops when RATELIMIT_DRY_RUN=True (logged only). Degrades gracefully to non-atomic counting when Redis is unavailable. _NAMESPACE is settings.POD_NAMESPACE, resolved once at startup: @@ -27,9 +27,10 @@ no per-request lookup is needed or correct. 
import hashlib import logging +import re import time -from django.conf import settings +from sapl import settings from django.core.cache import caches from django.http import HttpResponse @@ -52,6 +53,9 @@ RL_IP_BLOCKED = 'rl:ip:{ip}:blocked' RL_USER_REQUESTS = 'rl:{ns}:user:{uid}:reqs' RL_USER_BLOCKED = 'rl:{ns}:user:{uid}:blocked' RL_NS_WINDOW = 'rl:{ns}:ip:{ip}:w:{bucket}' +RL_PATH_REQUESTS = 'rl:{ns}:path:{sha256}:reqs' +RL_UA_BLOCKLIST = 'rl:bot:ua:blocked' # permanent SET — runtime UA deny list +FILE_META_KEY = 'file:{ns}:{sha256}' # content-type metadata cache (DB 0) # --------------------------------------------------------------------------- # Bot UA fragments @@ -169,31 +173,67 @@ def _parse_rate(rate_str): return count, seconds +def _incr_with_ttl(key, ttl): + """ + Atomic INCR + EXPIRE via Redis Lua script (ratelimit cache, DB 1). + Falls back to non-atomic cache get/set when Redis is unavailable. + Exported at module level so sapl.base.media can reuse it for path counters. + """ + try: + from django_redis import get_redis_connection + client = get_redis_connection('ratelimit') + return client.eval(_INCR_LUA, 1, key, ttl) + except Exception: + rl_cache = caches['ratelimit'] + count = (rl_cache.get(key) or 0) + 1 + rl_cache.set(key, count, timeout=ttl) + return count + + class RateLimitMiddleware: BLOCK_TTL = 300 # seconds an IP/user stays blocked after threshold breach + # In-process cache for the Redis UA deny list. + # Shared across all instances in the same worker process (one per worker). + # Refreshed every RATE_LIMITER_UA_BLOCKLIST_REFRESH seconds via SMEMBERS. 
+ _ua_blocklist: set = set() + _ua_blocklist_fetched_at: float = 0.0 + def __init__(self, get_response): self.get_response = get_response - self.dry_run = settings.RATELIMIT_DRY_RUN self.anon_threshold, self.anon_window = _parse_rate(settings.RATE_LIMITER_RATE) self.auth_threshold, self.auth_window = _parse_rate(settings.RATE_LIMITER_RATE_AUTHENTICATED) self.whitelist = set(settings.RATE_LIMIT_WHITELIST_IPS) self._rl_cache = caches['ratelimit'] + logger.info( + '[RATELIMIT] anon=%s auth=%s bot=%s whitelist=%s', + settings.RATE_LIMITER_RATE, + settings.RATE_LIMITER_RATE_AUTHENTICATED, + settings.RATE_LIMITER_RATE_BOT, + list(self.whitelist) or '(none)', + ) def __call__(self, request): decision = self._evaluate(request) if decision['action'] == 'block': logger.warning( - 'ratelimit_block reason=%s ip=%s path=%s dry_run=%s namespace=%s', + 'ratelimit_block reason=%s ip=%s path=%s namespace=%s', decision['reason'], decision['ip'], request.path, - self.dry_run, _NAMESPACE, extra={'ua': request.META.get('HTTP_USER_AGENT', '')}, ) - if not self.dry_run: - return HttpResponse(status=429) + response = HttpResponse(status=429) + response['Retry-After'] = self.BLOCK_TTL + return response + logger.debug( + 'ratelimit_pass ip=%s path=%s user=%s namespace=%s', + decision['ip'], + request.path, + getattr(getattr(request, 'user', None), 'pk', 'anon'), + _NAMESPACE, + ) return self.get_response(request) # ------------------------------------------------------------------ @@ -206,12 +246,16 @@ class RateLimitMiddleware: if ip in self.whitelist: return {'action': 'pass', 'ip': ip} - # Check 1: known bad UA + # Check 1: known bad UA (hardcoded Python list — substring match) ua = request.META.get('HTTP_USER_AGENT', '') for fragment in BOT_UA_FRAGMENTS: if fragment.lower() in ua.lower(): return {'action': 'block', 'reason': 'known_ua', 'ip': ip} + # Check 1b: runtime UA deny list (Redis SET — token hash match) + if self._is_redis_blocked_ua(ua): + return {'action': 'block', 'reason': 
'redis_ua', 'ip': ip} + # Check 2: IP already blocked if self._rl_cache.get(RL_IP_BLOCKED.format(ip=ip)): return {'action': 'block', 'reason': 'ip_blocked', 'ip': ip} @@ -268,20 +312,46 @@ class RateLimitMiddleware: return {'action': 'pass', 'ip': ip} # ------------------------------------------------------------------ - # Helpers + # Helpers — delegate to module-level so media.py can reuse them # ------------------------------------------------------------------ def _incr_with_ttl(self, key, ttl): + return _incr_with_ttl(key, ttl) + + def _refresh_ua_blocklist(self): """ - Atomic INCR + EXPIRE via Redis Lua script. - Falls back to non-atomic cache get/set when Redis is unavailable - (dry-run mode or file-based cache — correct enough for logging). + Fetch the full UA deny list from Redis DB 1 (SMEMBERS). + Stores sha256 hex-strings in the class-level set. + Falls back silently — an empty set means no runtime blocks. """ try: from django_redis import get_redis_connection client = get_redis_connection('ratelimit') - return client.eval(_INCR_LUA, 1, key, ttl) - except Exception: - count = (self._rl_cache.get(key) or 0) + 1 - self._rl_cache.set(key, count, timeout=ttl) - return count + raw = client.smembers(RL_UA_BLOCKLIST) + RateLimitMiddleware._ua_blocklist = { + m.decode() if isinstance(m, bytes) else m for m in raw + } + RateLimitMiddleware._ua_blocklist_fetched_at = time.time() + logger.debug('[RATELIMIT] ua_blocklist refreshed entries=%d', len(raw)) + except Exception as exc: + logger.debug('[RATELIMIT] ua_blocklist refresh skipped: %s', exc) + + def _is_redis_blocked_ua(self, ua): + """ + Return True if any slash/space/semicolon token in `ua` has a sha256 + that appears in the Redis UA deny list. + + The SET stores sha256(fragment) — e.g. sha256('GPTBot'). + Tokenising by common UA separators means 'GPTBot/1.1 (OpenAI)' + produces token 'GPTBot' whose hash matches the seeded entry. + Degrades to False when Redis is unavailable. 
+ """ + if time.time() - self._ua_blocklist_fetched_at > settings.RATE_LIMITER_UA_BLOCKLIST_REFRESH: + self._refresh_ua_blocklist() + if not self._ua_blocklist: + return False + tokens = re.split(r'[\s/;()+,]+', ua) + return any( + hashlib.sha256(t.encode()).hexdigest() in self._ua_blocklist + for t in tokens if t + ) diff --git a/sapl/settings.py b/sapl/settings.py index 1fae74235..e197e7d4a 100644 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -43,7 +43,7 @@ ALLOWED_HOSTS = ['*'] LOGIN_REDIRECT_URL = '/' LOGIN_URL = '/login/?next=' -SAPL_VERSION = '3.1.165-RC2' +SAPL_VERSION = '3.1.164-RC5' if DEBUG: EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' @@ -146,8 +146,6 @@ MIDDLEWARE = [ 'sapl.middleware.endpoint_restriction.EndpointRestrictionMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', - # RateLimitMiddleware runs after AuthenticationMiddleware so it can - # distinguish authenticated users (higher threshold) from anonymous ones. 
'sapl.middleware.ratelimit.RateLimitMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', @@ -222,61 +220,68 @@ POD_NAMESPACE = config('POD_NAMESPACE', default=host) REDIS_URL = config('REDIS_URL', default='') CACHE_BACKEND = config('CACHE_BACKEND', default='file') -_redis_ready = CACHE_BACKEND == 'redis' and bool(REDIS_URL) -_redis_pool = { - 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak - 'socket_timeout': 0.5, - 'socket_connect_timeout': 0.5, -} - -CACHES = { - # DB0 — page / view / static-file cache - 'default': { - 'BACKEND': ( - 'django_redis.cache.RedisCache' if _redis_ready - else 'django.core.cache.backends.filebased.FileBasedCache' - ), - 'LOCATION': REDIS_URL + '/0' if _redis_ready else '/var/tmp/django_cache', - 'KEY_PREFIX': f'cache:{POD_NAMESPACE}', - **( - { +def _build_cache_layer(pod_namespace, cache_backend, redis_url): + """ + Return the CACHES dict for the given runtime environment. + + Two backends are always defined: + default — DB 0: page/view/static-file cache, KEY_PREFIX isolates tenants. + ratelimit — DB 1: rate-limiter counters; pass-through KEY_FUNCTION keeps + raw 'rl:*' keys consistent between RateLimitMiddleware + (get_redis_connection) and @ratelimit decorator paths. + + Redis path: both backends share the same connection-pool settings so a + single pool object is created once and referenced by both caches. + File path: used in development and as a fallback when Redis is absent. 
+ """ + if cache_backend == 'redis' and bool(redis_url): + _pool = { + 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak + 'socket_timeout': 0.5, + 'socket_connect_timeout': 0.5, + } + return { + 'default': { + 'BACKEND': 'django_redis.cache.RedisCache', + 'LOCATION': f'{redis_url}/0', + 'KEY_PREFIX': f'cache:{pod_namespace}', + 'TIMEOUT': 300, 'OPTIONS': { 'CLIENT_CLASS': 'django_redis.client.DefaultClient', - 'CONNECTION_POOL_KWARGS': _redis_pool, - 'IGNORE_EXCEPTIONS': True, # degrades to cache miss on failure + 'CONNECTION_POOL_KWARGS': _pool, + 'IGNORE_EXCEPTIONS': True, # degrades to cache miss on Redis failure }, - 'TIMEOUT': 300, - } if _redis_ready else { - 'OPTIONS': {'MAX_ENTRIES': 10000}, - } - ), - }, - # DB1 — rate-limiter counters (raw keys, no KEY_PREFIX / version mangling) - 'ratelimit': { - 'BACKEND': ( - 'django_redis.cache.RedisCache' if _redis_ready - else 'django.core.cache.backends.filebased.FileBasedCache' - ), - 'LOCATION': REDIS_URL + '/1' if _redis_ready else '/var/tmp/django_ratelimit_cache', - # Pass-through key function so django-ratelimit decorator keys ('rl:{hash}') - # are stored as-is, matching the 'rl:*' keys written directly by - # RateLimitMiddleware via get_redis_connection(). Without this, Django's - # default key function would produce ':1:rl:{hash}' (empty prefix + version). 
- 'KEY_FUNCTION': 'sapl.middleware.ratelimit.make_ratelimit_cache_key', - **( - { + }, + 'ratelimit': { + 'BACKEND': 'django_redis.cache.RedisCache', + 'LOCATION': f'{redis_url}/1', + 'KEY_FUNCTION': 'sapl.middleware.ratelimit.make_ratelimit_cache_key', 'OPTIONS': { 'CLIENT_CLASS': 'django_redis.client.DefaultClient', - 'CONNECTION_POOL_KWARGS': _redis_pool, + 'CONNECTION_POOL_KWARGS': _pool, 'IGNORE_EXCEPTIONS': True, }, - } if _redis_ready else { - 'OPTIONS': {'MAX_ENTRIES': 5000}, - } - ), - }, -} + }, + } + + return { + 'default': { + 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/var/tmp/django_cache', + 'KEY_PREFIX': f'cache:{pod_namespace}', + 'OPTIONS': {'MAX_ENTRIES': 10000}, + }, + 'ratelimit': { + 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/var/tmp/django_ratelimit_cache', + 'KEY_FUNCTION': 'sapl.middleware.ratelimit.make_ratelimit_cache_key', + 'OPTIONS': {'MAX_ENTRIES': 5000}, + }, + } + + +CACHES = _build_cache_layer(POD_NAMESPACE, CACHE_BACKEND, REDIS_URL) RATELIMIT_USE_CACHE = 'ratelimit' @@ -394,10 +399,6 @@ FILE_UPLOAD_TEMP_DIR = '/var/interlegis/sapl/tmp' # --------------------------------------------------------------------------- # Rate limiting — RateLimitMiddleware (sapl/middleware/ratelimit.py) # --------------------------------------------------------------------------- -# Start with RATELIMIT_DRY_RUN=True; flip to False one check at a time -# after validating in logs that no legitimate traffic is flagged. 
-RATELIMIT_DRY_RUN = config('RATELIMIT_DRY_RUN', default=True, cast=bool) - RATE_LIMITER_RATE = config('RATE_LIMITER_RATE', default='35/m') RATE_LIMITER_RATE_AUTHENTICATED = config('RATE_LIMITER_RATE_AUTHENTICATED', default='120/m') RATE_LIMITER_RATE_BOT = config('RATE_LIMITER_RATE_BOT', default='5/m') @@ -410,6 +411,14 @@ RATE_LIMIT_WHITELIST_IPS = config( cast=lambda v: [x.strip() for x in v.split(',') if x.strip()], ) +# Seconds between re-fetches of the runtime UA deny list from Redis DB 1. +# Lower values pick up new blocked UAs faster; higher values reduce Redis round-trips. +RATE_LIMITER_UA_BLOCKLIST_REFRESH = config('RATE_LIMITER_UA_BLOCKLIST_REFRESH', default=60, cast=int) + +# Media file serving — serve_media (sapl/base/media.py) via X-Accel-Redirect. +MEDIA_PATH_COUNTER_TTL = config('MEDIA_PATH_COUNTER_TTL', default=60, cast=int) # seconds — per-path counter window +MEDIA_FILE_CACHE_TTL = config('MEDIA_FILE_CACHE_TTL', default=3600, cast=int) # seconds — content-type metadata TTL + # --------------------------------------------------------------------------- # Anonymous page caching — AnonCachePageMixin (sapl/middleware/page_cache.py) # TTLs apply only to anonymous (unauthenticated) GET responses. 
@@ -422,6 +431,13 @@ PAGE_CACHE_TTL_DETAIL = config('PAGE_CACHE_TTL_DETAIL', default=300, cast=int) # High-stability detail views (parlamentar, comissão) — change only each term PAGE_CACHE_TTL_STABLE = config('PAGE_CACHE_TTL_STABLE', default=600, cast=int) +logger.info( + '[PAGE_CACHE] list=%ds detail=%ds stable=%ds', + PAGE_CACHE_TTL_LIST, + PAGE_CACHE_TTL_DETAIL, + PAGE_CACHE_TTL_STABLE, +) + # Internationalization # https://docs.djangoproject.com/en/1.8/topics/i18n/ LANGUAGE_CODE = 'pt-br' @@ -512,7 +528,7 @@ CRISPY_FAIL_SILENTLY = not DEBUG FILTERS_HELP_TEXT_FILTER = False LOGGING_CONSOLE_VERBOSE = config( - 'LOGGING_CONSOLE_VERBOSE', cast=bool, default=False) + 'LOGGING_CONSOLE_VERBOSE', cast=bool, default=True) LOGGING = { 'version': 1, diff --git a/sapl/urls.py b/sapl/urls.py index d2967e927..dcb3010b1 100644 --- a/sapl/urls.py +++ b/sapl/urls.py @@ -82,6 +82,13 @@ urlpatterns += [ ] +# Media files — served via X-Accel-Redirect in production, directly in DEBUG. +from sapl.base.media import serve_media # noqa: E402 + +urlpatterns += [ + url(r'^media/(?P<path>.*)$', serve_media, name='serve_media'), +] + # Fix a static asset finding error on Django 1.9 + gunicorn: # http://stackoverflow.com/questions/35510373/ @@ -95,11 +102,7 @@ if settings.DEBUG: urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT) - urlpatterns += [ - url(r'^media/(?P<path>.*)$', view_static_server, { - 'document_root': settings.MEDIA_ROOT, - }), - ] + # media/ is handled by serve_media above (works in DEBUG too) # Make the rate limiter return 429 (Too Many Requests) instead of 403 (Forbidden Access)