From 12f6a3e396f55f8675da5d4af1ad3d0867145614 Mon Sep 17 00:00:00 2001
From: Edward Oliveira
Date: Mon, 13 Apr 2026 23:10:58 -0300
Subject: [PATCH] =?UTF-8?q?Phase=201:=20shared=20Redis=20pod=20=E2=80=94?=
 =?UTF-8?q?=20Django=20dual-backend=20cache=20+=20startup=20wiring?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

django/settings.py:
- REDIS_URL / CACHE_BACKEND env vars read at startup (written by start.sh)
- CACHES['default'] (DB0, KEY_PREFIX='sapl') switches between django-redis
  and FileBasedCache transparently; IGNORE_EXCEPTIONS=True for graceful
  degradation on Redis failure
- CACHES['ratelimit'] (DB1, no prefix) for cross-pod rate-limit counters
- RATELIMIT_USE_CACHE = 'ratelimit'
- Connection pool capped at 6/worker (1,200 pods × 2 workers × 6 = 14,400
  peak connections; maxclients=20,000 gives 40% headroom)

start.sh:
- resolve_redis_url(): reads REDIS_URL from local namespace Secret (envFrom)
  or falls back to global cluster Secret via k8s API
- configure_redis_cache(): ensures REDIS_CACHE waffle switch row exists (off)
- resolve_cache_backend(): reads waffle switch; sets CACHE_BACKEND=redis|file
- wait_for_redis(): blocks until Redis reachable; falls back gracefully
- write_env_file() now persists REDIS_URL + CACHE_BACKEND into pod .env

k8s manifests (docker/k8s/):
- redis-configmap.yaml: no persistence, allkeys-lru, maxmemory=5gb,
  maxclients=20000, activedefrag, 4 databases
- redis-deployment.yaml: redis:7-alpine, 1 replica, liveness/readiness
  probes, 1Gi request / 6Gi limit
- redis-service.yaml: ClusterIP on port 6379

requirements: add django-redis==4.12.1 (last release supporting Django 2.2;
the body of the requirements diff pins this version — the message previously
said 5.4.0, which contradicted the diff)

Co-Authored-By: Claude Sonnet 4.6
---
 docker/k8s/README.md             | 228 +++++++++++++++++++++++++++++++
 docker/k8s/redis-configmap.yaml  |  35 +++++
 docker/k8s/redis-deployment.yaml |  48 +++++++
 docker/k8s/redis-service.yaml    |  15 ++
 docker/startup_scripts/start.sh  |  92 ++++++++++++-
 requirements/requirements.txt    |   5 +
 sapl/settings.py                 |  61 ++++++++-
 7 files changed,
479 insertions(+), 5 deletions(-) create mode 100644 docker/k8s/README.md create mode 100644 docker/k8s/redis-configmap.yaml create mode 100644 docker/k8s/redis-deployment.yaml create mode 100644 docker/k8s/redis-service.yaml diff --git a/docker/k8s/README.md b/docker/k8s/README.md new file mode 100644 index 000000000..34d90b64c --- /dev/null +++ b/docker/k8s/README.md @@ -0,0 +1,228 @@ +# SAPL — Kubernetes Redis + +Manifests for the shared Redis instance used by all SAPL pods for +cross-pod rate limiting (DB 1) and view/static-file caching (DB 0). + +--- + +## Directory layout + +``` +docker/k8s/ +├── redis-configmap.yaml # redis.conf — no persistence, allkeys-lru, 5 GB ceiling +├── redis-deployment.yaml # Deployment (1 replica, redis:7-alpine) +├── redis-service.yaml # ClusterIP service on port 6379 +└── README.md # this file +``` + +--- + +## Prerequisites + +- `kubectl` configured to talk to the target cluster. +- A `redis` namespace (created below if it doesn't exist). + +--- + +## Deploy + +```bash +# 1. Create the namespace (idempotent) +kubectl create namespace redis --dry-run=client -o yaml | kubectl apply -f - + +# 2. Apply all three manifests +kubectl apply -f docker/k8s/redis-configmap.yaml +kubectl apply -f docker/k8s/redis-deployment.yaml +kubectl apply -f docker/k8s/redis-service.yaml + +# 3. 
Verify the pod is Running
+kubectl -n redis get pods -l app=sapl-redis
+```
+
+Expected output:
+```
+NAME                          READY   STATUS    RESTARTS   AGE
+sapl-redis-6d9f8b7c4d-xk2lm   1/1     Running   0          30s
+```
+
+---
+
+## Wire a SAPL namespace to Redis
+
+```bash
+# Create the per-namespace Secret (one-off per tenant)
+kubectl create secret generic sapl-redis \
+  --namespace=<namespace> \
+  --from-literal=REDIS_URL="redis://sapl-redis.redis.svc.cluster.local:6379" \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+# Ensure the waffle switch row exists (starts OFF)
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE off --create
+
+# Enable Redis for this namespace
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE on
+
+# Rolling restart so start.sh picks up the new switch value
+kubectl rollout restart deployment/sapl -n <namespace>
+kubectl rollout status deployment/sapl -n <namespace>
+```
+
+### Fleet-wide rollout
+
+```bash
+kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \
+  xargs -P 10 -I{} kubectl exec -n {} deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE on --create
+
+kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \
+  xargs -P 5 -I{} kubectl rollout restart deployment/sapl -n {}
+```
+
+### Roll back (without removing the Secret)
+
+```bash
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE off
+kubectl rollout restart deployment/sapl -n <namespace>
+```
+
+---
+
+## Monitor
+
+### Pod and events
+
+```bash
+# Pod status
+kubectl -n redis get pods -l app=sapl-redis -o wide
+
+# Deployment events (useful right after apply)
+kubectl -n redis describe deployment sapl-redis
+
+# Pod events (OOMKill, restarts, etc.)
+kubectl -n redis describe pod -l app=sapl-redis +``` + +### Logs + +```bash +# Tail live logs +kubectl -n redis logs -f deploy/sapl-redis + +# Last 100 lines +kubectl -n redis logs deploy/sapl-redis --tail=100 +``` + +### Redis INFO + +```bash +# Memory usage +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli info memory \ + | grep -E 'used_memory_human|maxmemory_human|mem_fragmentation_ratio' + +# Connection pressure +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli info stats \ + | grep -E 'rejected_connections|instantaneous_ops_per_sec' + +# Key distribution per DB +kubectl exec -n redis deploy/sapl-redis -- redis-cli info keyspace + +# Recent slow queries +kubectl exec -n redis deploy/sapl-redis -- redis-cli slowlog get 10 + +# Live command sampling (1-second window) +kubectl exec -n redis deploy/sapl-redis -- redis-cli --latency-history -i 1 +``` + +### Rate-limiter keys (DB 1) + +```bash +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli -n 1 dbsize + +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli -n 1 --scan --pattern 'rl:ip:*' | head -20 +``` + +--- + +## Seed the UA deny list (once after first deploy) + +```bash +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked \ + "$(echo -n 'GPTBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'ClaudeBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'PerplexityBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'Bytespider' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'AhrefsBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'meta-externalagent' | sha256sum | cut -d' ' -f1)" + +# Add a new offender at runtime (no restart required) +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked "$(echo -n 'NewBot/1.0' | sha256sum | cut -d' ' -f1)" +``` + +--- + +## Local standalone Redis (development / testing) + +No Kubernetes? 
Run Redis directly with Docker: + +```bash +sudo docker run --rm -p 6379:6379 redis:7-alpine \ + redis-server --save "" --appendonly no +``` + +Then point Django at it by exporting the env var before starting the dev server: + +```bash +export REDIS_URL="redis://localhost:6379" +export CACHE_BACKEND="redis" +python manage.py runserver +``` + +Or add them to your local `.env` file: + +``` +REDIS_URL=redis://localhost:6379 +CACHE_BACKEND=redis +``` + +> **Note**: the waffle switch `REDIS_CACHE` must also be `on` in your local +> database for `start.sh` to activate the Redis backend. Run: +> ```bash +> python manage.py waffle_switch REDIS_CACHE on --create +> ``` + +--- + +## Update `redis.conf` without redeploying + +```bash +# Edit the ConfigMap +kubectl -n redis edit configmap redis-config + +# Restart the pod to pick up the new config +kubectl -n redis rollout restart deployment/sapl-redis +``` + +--- + +## Key schema reference + +| DB | Use case | Key pattern | TTL | +|----|----------|-------------|-----| +| 0 | Page / view cache | `sapl:cache:*` | 60 – 3 600 s | +| 0 | Static file cache (logos) | `static:{ns}:{sha256}` | 3 – 24 h | +| 0 | PDF cache (≤ 360 KB) | `file:{ns}:{sha256}` | 1 h | +| 1 | IP rate-limit counter | `rl:ip:{ip}:reqs` | 60 s | +| 1 | IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | +| 1 | User rate-limit counter | `rl:{ns}:user:{id}:reqs` | 60 s | +| 1 | Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | +| 1 | UA deny list | `rl:bot:ua:blocked` | permanent SET | +| 2 | Django Channels (future) | `channels:*` | session TTL | diff --git a/docker/k8s/redis-configmap.yaml b/docker/k8s/redis-configmap.yaml new file mode 100644 index 000000000..372d58975 --- /dev/null +++ b/docker/k8s/redis-configmap.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: redis-config + namespace: redis +data: + redis.conf: | + save "" + appendonly no + + maxmemory 5gb + maxmemory-policy allkeys-lru + maxmemory-samples 10 + + maxclients 20000 
+    tcp-backlog 511
+    timeout 300
+    tcp-keepalive 60
+
+    hz 20
+    lazyfree-lazy-eviction yes
+    lazyfree-lazy-expire yes
+    lazyfree-lazy-server-del yes
+
+    slowlog-log-slower-than 10000
+    slowlog-max-len 256
+    latency-monitor-threshold 10
+
+    # DB0: cache, DB1: rate limiter, DB2: channels (future).
+    bind 0.0.0.0
+    protected-mode no
+    databases 4
+
+    activedefrag yes
+    active-defrag-ignore-bytes 100mb
+    active-defrag-threshold-lower 10
diff --git a/docker/k8s/redis-deployment.yaml b/docker/k8s/redis-deployment.yaml
new file mode 100644
index 000000000..732faff11
--- /dev/null
+++ b/docker/k8s/redis-deployment.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: sapl-redis
+  namespace: redis
+  labels:
+    app: sapl-redis
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: sapl-redis
+  template:
+    metadata:
+      labels:
+        app: sapl-redis
+    spec:
+      containers:
+        - name: redis
+          image: redis:7-alpine
+          command: ["redis-server", "/etc/redis/redis.conf"]
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "250m"
+            limits:
+              memory: "6Gi"
+              cpu: "1000m"
+          ports:
+            - containerPort: 6379
+          livenessProbe:
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 10
+            periodSeconds: 15
+            failureThreshold: 3
+          readinessProbe:
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          volumeMounts:
+            - name: redis-config
+              mountPath: /etc/redis
+      volumes:
+        - name: redis-config
+          configMap:
+            name: redis-config
diff --git a/docker/k8s/redis-service.yaml b/docker/k8s/redis-service.yaml
new file mode 100644
index 000000000..8e4fcd3e9
--- /dev/null
+++ b/docker/k8s/redis-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: sapl-redis
+  namespace: redis
+  labels:
+    app: sapl-redis
+spec:
+  selector:
+    app: sapl-redis
+  ports:
+    - name: redis
+      port: 6379
+      targetPort: 6379
+  type: ClusterIP
diff --git a/docker/startup_scripts/start.sh b/docker/startup_scripts/start.sh
index bd98bdfc0..2b39cef82 100755
--- 
a/docker/startup_scripts/start.sh +++ b/docker/startup_scripts/start.sh @@ -104,6 +104,8 @@ write_env_file() { : "${RF:=1}" : "${MAX_SHARDS_PER_NODE:=1}" : "${ENABLE_SAPN:=False}" + : "${REDIS_URL:=}" + : "${CACHE_BACKEND:=file}" tmp="$(mktemp)" { @@ -126,6 +128,8 @@ write_env_file() { printf 'RF=%s\n' "$RF" printf 'MAX_SHARDS_PER_NODE=%s\n' "$MAX_SHARDS_PER_NODE" printf 'ENABLE_SAPN=%s\n' "$ENABLE_SAPN" + printf 'REDIS_URL=%s\n' "$REDIS_URL" + printf 'CACHE_BACKEND=%s\n' "$CACHE_BACKEND" } > "$tmp" chmod 600 "$tmp" @@ -256,6 +260,88 @@ setup_cache_dir() { umask 0007 } +# --------------------------------------------------------------------------- +# Redis — resolve URL, check waffle switch, wait for connectivity +# --------------------------------------------------------------------------- + +# 1. Populate REDIS_URL from local Secret (envFrom) or fall back to global +# cluster Secret read via the k8s API. +resolve_redis_url() { + # Already injected by pod's envFrom (local namespace Secret) — highest precedence. + [[ -n "${REDIS_URL:-}" ]] && { log "REDIS_URL from local secret."; return 0; } + + # Try the global cluster Secret via the k8s in-cluster API. + local api="https://kubernetes.default.svc" + local token_file="/var/run/secrets/kubernetes.io/serviceaccount/token" + local ca="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + [[ -f "$token_file" ]] || { log "No k8s service-account token — skipping global Redis secret."; return 0; } + + local token url + token="$(<"$token_file")" + url=$(curl -sf --cacert "$ca" \ + -H "Authorization: Bearer $token" \ + "${api}/api/v1/namespaces/interlegis-infra/secrets/sapl-global-redis" \ + | python3 -c " +import sys, json, base64 +d = json.load(sys.stdin).get('data', {}) +v = d.get('REDIS_URL', '') +print(base64.b64decode(v).decode() if v else '') +" 2>/dev/null || echo "") + + if [[ -n "$url" ]]; then + export REDIS_URL="$url" + log "REDIS_URL from global cluster secret." 
+ else + log "No REDIS_URL found — file-based cache will be used." + fi +} + +# 2. Check the REDIS_CACHE waffle switch; set CACHE_BACKEND accordingly. +resolve_cache_backend() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "REDIS_URL set — checking REDIS_CACHE waffle switch..." + local active + active=$(psql "$DATABASE_URL" -At -v ON_ERROR_STOP=0 \ + -c "SELECT active FROM waffle_switch WHERE name='REDIS_CACHE' LIMIT 1;" \ + 2>/dev/null || echo "") + if [[ "$active" == "t" ]]; then + export CACHE_BACKEND="redis" + log "REDIS_CACHE switch ON — activating Redis cache backend." + else + export CACHE_BACKEND="file" + log "REDIS_CACHE switch OFF — using file-based cache." + fi +} + +# 3. Ensure the REDIS_CACHE waffle switch row exists (default: off). +configure_redis_cache() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "Ensuring REDIS_CACHE waffle switch exists (default: off)..." + python3 manage.py waffle_switch REDIS_CACHE off --create || true +} + +# 4. Block until Redis is reachable (or give up gracefully). +wait_for_redis() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + [[ "${CACHE_BACKEND:-file}" != "redis" ]] && return 0 + log "Checking Redis connectivity..." + local host port retries=10 + host=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.hostname or 'localhost')") + port=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.port or 6379)") + until python3 -c "import socket; s=socket.create_connection(('$host',$port),2); s.close()" 2>/dev/null; do + retries=$((retries - 1)) + if [[ $retries -eq 0 ]]; then + log "WARNING: Redis unreachable after retries — falling back to file cache." + export CACHE_BACKEND="file" + return 0 + fi + log "Waiting for Redis at $host:$port... ($retries retries left)" + sleep 2 + done + log "Redis reachable at $host:$port." +} + start_services() { log "Starting gunicorn..." 
gunicorn -c gunicorn.conf.py & @@ -265,10 +351,14 @@ start_services() { main() { create_secret - write_env_file + resolve_redis_url wait_for_pg configure_pg_timezone migrate_db + configure_redis_cache + resolve_cache_backend + wait_for_redis + write_env_file # writes resolved REDIS_URL + CACHE_BACKEND into .env configure_solr || true configure_sapn create_admin diff --git a/requirements/requirements.txt b/requirements/requirements.txt index ca05c71b7..9db02dc49 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -42,3 +42,8 @@ XlsxWriter==3.2.0 setuptools==80.9.0 git+https://github.com/interlegis/django-admin-bootstrapped + +# Redis cache backend (Phase 1 — shared rate-limiter state). +# 4.12.1 is the last release that explicitly supports Django 2.2. +# Upgrade to 5.x when the project moves to Django 3.2+. +django-redis==4.12.1 diff --git a/sapl/settings.py b/sapl/settings.py index ee7669263..c4fc7987b 100644 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -201,14 +201,67 @@ SPECTACULAR_SETTINGS = { 'VERSION': '1.0.0', } +# --------------------------------------------------------------------------- +# Cache — switches between file-based (default) and Redis at pod startup. +# REDIS_URL and CACHE_BACKEND are resolved by start.sh before Gunicorn +# starts; settings.py reads them as env vars (written into .env). 
+# --------------------------------------------------------------------------- +REDIS_URL = config('REDIS_URL', default='') +CACHE_BACKEND = config('CACHE_BACKEND', default='file') + +_redis_ready = CACHE_BACKEND == 'redis' and bool(REDIS_URL) + +_redis_pool = { + 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak + 'socket_timeout': 0.5, + 'socket_connect_timeout': 0.5, +} + CACHES = { + # DB0 — page / view / static-file cache 'default': { - 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', - 'LOCATION': '/var/tmp/django_cache', - 'OPTIONS': {"MAX_ENTRIES": 10000}, - } + 'BACKEND': ( + 'django_redis.cache.RedisCache' if _redis_ready + else 'django.core.cache.backends.filebased.FileBasedCache' + ), + 'LOCATION': REDIS_URL + '/0' if _redis_ready else '/var/tmp/django_cache', + 'KEY_PREFIX': 'sapl', + **( + { + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': _redis_pool, + 'IGNORE_EXCEPTIONS': True, # degrades to cache miss on failure + }, + 'TIMEOUT': 300, + } if _redis_ready else { + 'OPTIONS': {'MAX_ENTRIES': 10000}, + } + ), + }, + # DB1 — rate-limiter counters (raw keys, no KEY_PREFIX) + 'ratelimit': { + 'BACKEND': ( + 'django_redis.cache.RedisCache' if _redis_ready + else 'django.core.cache.backends.filebased.FileBasedCache' + ), + 'LOCATION': REDIS_URL + '/1' if _redis_ready else '/var/tmp/django_ratelimit_cache', + **( + { + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': _redis_pool, + 'IGNORE_EXCEPTIONS': True, + }, + } if _redis_ready else { + 'OPTIONS': {'MAX_ENTRIES': 5000}, + } + ), + }, } +RATELIMIT_USE_CACHE = 'ratelimit' + ROOT_URLCONF = 'sapl.urls' FORM_RENDERER = 'django.forms.renderers.TemplatesSetting'