From 12f6a3e396f55f8675da5d4af1ad3d0867145614 Mon Sep 17 00:00:00 2001
From: Edward Oliveira
Date: Mon, 13 Apr 2026 23:10:58 -0300
Subject: [PATCH] =?UTF-8?q?Phase=201:=20shared=20Redis=20pod=20=E2=80=94?=
 =?UTF-8?q?=20Django=20dual-backend=20cache=20+=20startup=20wiring?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

django/settings.py:
- REDIS_URL / CACHE_BACKEND env vars read at startup (written by start.sh)
- CACHES['default'] (DB0, KEY_PREFIX='sapl') switches between django-redis
  and FileBasedCache transparently; IGNORE_EXCEPTIONS=True for graceful
  degradation on Redis failure
- CACHES['ratelimit'] (DB1, no prefix) for cross-pod rate-limit counters
- RATELIMIT_USE_CACHE = 'ratelimit'
- Connection pool capped at 6/worker (1,200 pods × 2 workers × 6 = 14,400
  peak connections; maxclients=20,000 gives 40% headroom)

start.sh:
- resolve_redis_url(): reads REDIS_URL from local namespace Secret (envFrom)
  or falls back to global cluster Secret via k8s API
- configure_redis_cache(): ensures REDIS_CACHE waffle switch row exists (off)
- resolve_cache_backend(): reads waffle switch; sets CACHE_BACKEND=redis|file
- wait_for_redis(): blocks until Redis reachable; falls back gracefully
- write_env_file() now persists REDIS_URL + CACHE_BACKEND into pod .env

k8s manifests (docker/k8s/):
- redis-configmap.yaml: no persistence, allkeys-lru, maxmemory=5gb,
  maxclients=20000, activedefrag, 4 databases
- redis-deployment.yaml: redis:7-alpine, 1 replica, liveness/readiness
  probes, 1Gi request / 6Gi limit
- redis-service.yaml: ClusterIP on port 6379

requirements: add django-redis==4.12.1 (last release supporting Django 2.2;
the body of the requirements diff pins this version — the message previously
said 5.4.0, which contradicted the diff)

Co-Authored-By: Claude Sonnet 4.6
---
 docker/k8s/README.md             | 228 +++++++++++++++++++++++++++++++
 docker/k8s/redis-configmap.yaml  |  35 +++++
 docker/k8s/redis-deployment.yaml |  48 +++++++
 docker/k8s/redis-service.yaml    |  15 ++
 docker/startup_scripts/start.sh  |  92 ++++++++++++-
 requirements/requirements.txt    |   5 +
 sapl/settings.py                 |  61 ++++++++-
 7 files changed,
479 insertions(+), 5 deletions(-) create mode 100644 docker/k8s/README.md create mode 100644 docker/k8s/redis-configmap.yaml create mode 100644 docker/k8s/redis-deployment.yaml create mode 100644 docker/k8s/redis-service.yaml diff --git a/docker/k8s/README.md b/docker/k8s/README.md new file mode 100644 index 000000000..34d90b64c --- /dev/null +++ b/docker/k8s/README.md @@ -0,0 +1,228 @@ +# SAPL — Kubernetes Redis + +Manifests for the shared Redis instance used by all SAPL pods for +cross-pod rate limiting (DB 1) and view/static-file caching (DB 0). + +--- + +## Directory layout + +``` +docker/k8s/ +├── redis-configmap.yaml # redis.conf — no persistence, allkeys-lru, 5 GB ceiling +├── redis-deployment.yaml # Deployment (1 replica, redis:7-alpine) +├── redis-service.yaml # ClusterIP service on port 6379 +└── README.md # this file +``` + +--- + +## Prerequisites + +- `kubectl` configured to talk to the target cluster. +- A `redis` namespace (created below if it doesn't exist). + +--- + +## Deploy + +```bash +# 1. Create the namespace (idempotent) +kubectl create namespace redis --dry-run=client -o yaml | kubectl apply -f - + +# 2. Apply all three manifests +kubectl apply -f docker/k8s/redis-configmap.yaml +kubectl apply -f docker/k8s/redis-deployment.yaml +kubectl apply -f docker/k8s/redis-service.yaml + +# 3. 
Verify the pod is Running
+kubectl -n redis get pods -l app=sapl-redis
+```
+
+Expected output:
+```
+NAME                          READY   STATUS    RESTARTS   AGE
+sapl-redis-6d9f8b7c4d-xk2lm   1/1     Running   0          30s
+```
+
+---
+
+## Wire a SAPL namespace to Redis
+
+```bash
+# Create the per-namespace Secret (one-off per tenant)
+kubectl create secret generic sapl-redis \
+  --namespace=<namespace> \
+  --from-literal=REDIS_URL="redis://sapl-redis.redis.svc.cluster.local:6379" \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+# Ensure the waffle switch row exists (starts OFF)
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE off --create
+
+# Enable Redis for this namespace
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE on
+
+# Rolling restart so start.sh picks up the new switch value
+kubectl rollout restart deployment/sapl -n <namespace>
+kubectl rollout status deployment/sapl -n <namespace>
+```
+
+### Fleet-wide rollout
+
+```bash
+kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \
+  xargs -P 10 -I{} kubectl exec -n {} deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE on --create
+
+kubectl get namespaces -l app=sapl -o name | sed 's|namespace/||' | \
+  xargs -P 5 -I{} kubectl rollout restart deployment/sapl -n {}
+```
+
+### Roll back (without removing the Secret)
+
+```bash
+kubectl exec -n <namespace> deploy/sapl -- \
+  python manage.py waffle_switch REDIS_CACHE off
+kubectl rollout restart deployment/sapl -n <namespace>
+```
+
+---
+
+## Monitor
+
+### Pod and events
+
+```bash
+# Pod status
+kubectl -n redis get pods -l app=sapl-redis -o wide
+
+# Deployment events (useful right after apply)
+kubectl -n redis describe deployment sapl-redis
+
+# Pod events (OOMKill, restarts, etc.)
+kubectl -n redis describe pod -l app=sapl-redis +``` + +### Logs + +```bash +# Tail live logs +kubectl -n redis logs -f deploy/sapl-redis + +# Last 100 lines +kubectl -n redis logs deploy/sapl-redis --tail=100 +``` + +### Redis INFO + +```bash +# Memory usage +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli info memory \ + | grep -E 'used_memory_human|maxmemory_human|mem_fragmentation_ratio' + +# Connection pressure +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli info stats \ + | grep -E 'rejected_connections|instantaneous_ops_per_sec' + +# Key distribution per DB +kubectl exec -n redis deploy/sapl-redis -- redis-cli info keyspace + +# Recent slow queries +kubectl exec -n redis deploy/sapl-redis -- redis-cli slowlog get 10 + +# Live command sampling (1-second window) +kubectl exec -n redis deploy/sapl-redis -- redis-cli --latency-history -i 1 +``` + +### Rate-limiter keys (DB 1) + +```bash +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli -n 1 dbsize + +kubectl exec -n redis deploy/sapl-redis -- \ + redis-cli -n 1 --scan --pattern 'rl:ip:*' | head -20 +``` + +--- + +## Seed the UA deny list (once after first deploy) + +```bash +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked \ + "$(echo -n 'GPTBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'ClaudeBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'PerplexityBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'Bytespider' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'AhrefsBot' | sha256sum | cut -d' ' -f1)" \ + "$(echo -n 'meta-externalagent' | sha256sum | cut -d' ' -f1)" + +# Add a new offender at runtime (no restart required) +kubectl exec -n redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:bot:ua:blocked "$(echo -n 'NewBot/1.0' | sha256sum | cut -d' ' -f1)" +``` + +--- + +## Local standalone Redis (development / testing) + +No Kubernetes? 
Run Redis directly with Docker: + +```bash +sudo docker run --rm -p 6379:6379 redis:7-alpine \ + redis-server --save "" --appendonly no +``` + +Then point Django at it by exporting the env var before starting the dev server: + +```bash +export REDIS_URL="redis://localhost:6379" +export CACHE_BACKEND="redis" +python manage.py runserver +``` + +Or add them to your local `.env` file: + +``` +REDIS_URL=redis://localhost:6379 +CACHE_BACKEND=redis +``` + +> **Note**: the waffle switch `REDIS_CACHE` must also be `on` in your local +> database for `start.sh` to activate the Redis backend. Run: +> ```bash +> python manage.py waffle_switch REDIS_CACHE on --create +> ``` + +--- + +## Update `redis.conf` without redeploying + +```bash +# Edit the ConfigMap +kubectl -n redis edit configmap redis-config + +# Restart the pod to pick up the new config +kubectl -n redis rollout restart deployment/sapl-redis +``` + +--- + +## Key schema reference + +| DB | Use case | Key pattern | TTL | +|----|----------|-------------|-----| +| 0 | Page / view cache | `sapl:cache:*` | 60 – 3 600 s | +| 0 | Static file cache (logos) | `static:{ns}:{sha256}` | 3 – 24 h | +| 0 | PDF cache (≤ 360 KB) | `file:{ns}:{sha256}` | 1 h | +| 1 | IP rate-limit counter | `rl:ip:{ip}:reqs` | 60 s | +| 1 | IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | +| 1 | User rate-limit counter | `rl:{ns}:user:{id}:reqs` | 60 s | +| 1 | Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | +| 1 | UA deny list | `rl:bot:ua:blocked` | permanent SET | +| 2 | Django Channels (future) | `channels:*` | session TTL | diff --git a/docker/k8s/redis-configmap.yaml b/docker/k8s/redis-configmap.yaml new file mode 100644 index 000000000..372d58975 --- /dev/null +++ b/docker/k8s/redis-configmap.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: redis-config + namespace: redis +data: + redis.conf: | + save "" + appendonly no + + maxmemory 5gb + maxmemory-policy allkeys-lru + maxmemory-samples 10 + + maxclients 20000 
+    tcp-backlog 511
+    timeout 300
+    tcp-keepalive 60
+
+    hz 20
+    lazyfree-lazy-eviction yes
+    lazyfree-lazy-expire yes
+    lazyfree-lazy-server-del yes
+
+    slowlog-log-slower-than 10000
+    slowlog-max-len 256
+    latency-monitor-threshold 10
+
+    # DB0: cache, DB1: rate limiter, DB2: channels (future).
+    bind 0.0.0.0
+    protected-mode no
+    databases 4
+
+    activedefrag yes
+    active-defrag-ignore-bytes 100mb
+    active-defrag-threshold-lower 10
diff --git a/docker/k8s/redis-deployment.yaml b/docker/k8s/redis-deployment.yaml
new file mode 100644
index 000000000..732faff11
--- /dev/null
+++ b/docker/k8s/redis-deployment.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: sapl-redis
+  namespace: redis
+  labels:
+    app: sapl-redis
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: sapl-redis
+  template:
+    metadata:
+      labels:
+        app: sapl-redis
+    spec:
+      containers:
+        - name: redis
+          image: redis:7-alpine
+          command: ["redis-server", "/etc/redis/redis.conf"]
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "250m"
+            limits:
+              memory: "6Gi"
+              cpu: "1000m"
+          ports:
+            - containerPort: 6379
+          livenessProbe:
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 10
+            periodSeconds: 15
+            failureThreshold: 3
+          readinessProbe:
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          volumeMounts:
+            - name: redis-config
+              mountPath: /etc/redis
+      volumes:
+        - name: redis-config
+          configMap:
+            name: redis-config
diff --git a/docker/k8s/redis-service.yaml b/docker/k8s/redis-service.yaml
new file mode 100644
index 000000000..8e4fcd3e9
--- /dev/null
+++ b/docker/k8s/redis-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: sapl-redis
+  namespace: redis
+  labels:
+    app: sapl-redis
+spec:
+  selector:
+    app: sapl-redis
+  ports:
+    - name: redis
+      port: 6379
+      targetPort: 6379
+  type: ClusterIP
diff --git a/docker/startup_scripts/start.sh b/docker/startup_scripts/start.sh
index bd98bdfc0..2b39cef82 100755
--- 
a/docker/startup_scripts/start.sh +++ b/docker/startup_scripts/start.sh @@ -104,6 +104,8 @@ write_env_file() { : "${RF:=1}" : "${MAX_SHARDS_PER_NODE:=1}" : "${ENABLE_SAPN:=False}" + : "${REDIS_URL:=}" + : "${CACHE_BACKEND:=file}" tmp="$(mktemp)" { @@ -126,6 +128,8 @@ write_env_file() { printf 'RF=%s\n' "$RF" printf 'MAX_SHARDS_PER_NODE=%s\n' "$MAX_SHARDS_PER_NODE" printf 'ENABLE_SAPN=%s\n' "$ENABLE_SAPN" + printf 'REDIS_URL=%s\n' "$REDIS_URL" + printf 'CACHE_BACKEND=%s\n' "$CACHE_BACKEND" } > "$tmp" chmod 600 "$tmp" @@ -256,6 +260,88 @@ setup_cache_dir() { umask 0007 } +# --------------------------------------------------------------------------- +# Redis — resolve URL, check waffle switch, wait for connectivity +# --------------------------------------------------------------------------- + +# 1. Populate REDIS_URL from local Secret (envFrom) or fall back to global +# cluster Secret read via the k8s API. +resolve_redis_url() { + # Already injected by pod's envFrom (local namespace Secret) — highest precedence. + [[ -n "${REDIS_URL:-}" ]] && { log "REDIS_URL from local secret."; return 0; } + + # Try the global cluster Secret via the k8s in-cluster API. + local api="https://kubernetes.default.svc" + local token_file="/var/run/secrets/kubernetes.io/serviceaccount/token" + local ca="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + [[ -f "$token_file" ]] || { log "No k8s service-account token — skipping global Redis secret."; return 0; } + + local token url + token="$(<"$token_file")" + url=$(curl -sf --cacert "$ca" \ + -H "Authorization: Bearer $token" \ + "${api}/api/v1/namespaces/interlegis-infra/secrets/sapl-global-redis" \ + | python3 -c " +import sys, json, base64 +d = json.load(sys.stdin).get('data', {}) +v = d.get('REDIS_URL', '') +print(base64.b64decode(v).decode() if v else '') +" 2>/dev/null || echo "") + + if [[ -n "$url" ]]; then + export REDIS_URL="$url" + log "REDIS_URL from global cluster secret." 
+ else + log "No REDIS_URL found — file-based cache will be used." + fi +} + +# 2. Check the REDIS_CACHE waffle switch; set CACHE_BACKEND accordingly. +resolve_cache_backend() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "REDIS_URL set — checking REDIS_CACHE waffle switch..." + local active + active=$(psql "$DATABASE_URL" -At -v ON_ERROR_STOP=0 \ + -c "SELECT active FROM waffle_switch WHERE name='REDIS_CACHE' LIMIT 1;" \ + 2>/dev/null || echo "") + if [[ "$active" == "t" ]]; then + export CACHE_BACKEND="redis" + log "REDIS_CACHE switch ON — activating Redis cache backend." + else + export CACHE_BACKEND="file" + log "REDIS_CACHE switch OFF — using file-based cache." + fi +} + +# 3. Ensure the REDIS_CACHE waffle switch row exists (default: off). +configure_redis_cache() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + log "Ensuring REDIS_CACHE waffle switch exists (default: off)..." + python3 manage.py waffle_switch REDIS_CACHE off --create || true +} + +# 4. Block until Redis is reachable (or give up gracefully). +wait_for_redis() { + [[ -z "${REDIS_URL:-}" ]] && return 0 + [[ "${CACHE_BACKEND:-file}" != "redis" ]] && return 0 + log "Checking Redis connectivity..." + local host port retries=10 + host=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.hostname or 'localhost')") + port=$(python3 -c "from urllib.parse import urlparse; u=urlparse('${REDIS_URL}'); print(u.port or 6379)") + until python3 -c "import socket; s=socket.create_connection(('$host',$port),2); s.close()" 2>/dev/null; do + retries=$((retries - 1)) + if [[ $retries -eq 0 ]]; then + log "WARNING: Redis unreachable after retries — falling back to file cache." + export CACHE_BACKEND="file" + return 0 + fi + log "Waiting for Redis at $host:$port... ($retries retries left)" + sleep 2 + done + log "Redis reachable at $host:$port." +} + start_services() { log "Starting gunicorn..." 
gunicorn -c gunicorn.conf.py & @@ -265,10 +351,14 @@ start_services() { main() { create_secret - write_env_file + resolve_redis_url wait_for_pg configure_pg_timezone migrate_db + configure_redis_cache + resolve_cache_backend + wait_for_redis + write_env_file # writes resolved REDIS_URL + CACHE_BACKEND into .env configure_solr || true configure_sapn create_admin diff --git a/requirements/requirements.txt b/requirements/requirements.txt index ca05c71b7..9db02dc49 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -42,3 +42,8 @@ XlsxWriter==3.2.0 setuptools==80.9.0 git+https://github.com/interlegis/django-admin-bootstrapped + +# Redis cache backend (Phase 1 — shared rate-limiter state). +# 4.12.1 is the last release that explicitly supports Django 2.2. +# Upgrade to 5.x when the project moves to Django 3.2+. +django-redis==4.12.1 diff --git a/sapl/settings.py b/sapl/settings.py index ee7669263..c4fc7987b 100644 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -201,14 +201,67 @@ SPECTACULAR_SETTINGS = { 'VERSION': '1.0.0', } +# --------------------------------------------------------------------------- +# Cache — switches between file-based (default) and Redis at pod startup. +# REDIS_URL and CACHE_BACKEND are resolved by start.sh before Gunicorn +# starts; settings.py reads them as env vars (written into .env). 
+# --------------------------------------------------------------------------- +REDIS_URL = config('REDIS_URL', default='') +CACHE_BACKEND = config('CACHE_BACKEND', default='file') + +_redis_ready = CACHE_BACKEND == 'redis' and bool(REDIS_URL) + +_redis_pool = { + 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak + 'socket_timeout': 0.5, + 'socket_connect_timeout': 0.5, +} + CACHES = { + # DB0 — page / view / static-file cache 'default': { - 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', - 'LOCATION': '/var/tmp/django_cache', - 'OPTIONS': {"MAX_ENTRIES": 10000}, - } + 'BACKEND': ( + 'django_redis.cache.RedisCache' if _redis_ready + else 'django.core.cache.backends.filebased.FileBasedCache' + ), + 'LOCATION': REDIS_URL + '/0' if _redis_ready else '/var/tmp/django_cache', + 'KEY_PREFIX': 'sapl', + **( + { + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': _redis_pool, + 'IGNORE_EXCEPTIONS': True, # degrades to cache miss on failure + }, + 'TIMEOUT': 300, + } if _redis_ready else { + 'OPTIONS': {'MAX_ENTRIES': 10000}, + } + ), + }, + # DB1 — rate-limiter counters (raw keys, no KEY_PREFIX) + 'ratelimit': { + 'BACKEND': ( + 'django_redis.cache.RedisCache' if _redis_ready + else 'django.core.cache.backends.filebased.FileBasedCache' + ), + 'LOCATION': REDIS_URL + '/1' if _redis_ready else '/var/tmp/django_ratelimit_cache', + **( + { + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': _redis_pool, + 'IGNORE_EXCEPTIONS': True, + }, + } if _redis_ready else { + 'OPTIONS': {'MAX_ENTRIES': 5000}, + } + ), + }, } +RATELIMIT_USE_CACHE = 'ratelimit' + ROOT_URLCONF = 'sapl.urls' FORM_RENDERER = 'django.forms.renderers.TemplatesSetting'