diff --git a/plan/RATE-LIMITER-PLAN.md b/plan/RATE-LIMITER-PLAN.md index 458df0014..6ea40ea87 100644 --- a/plan/RATE-LIMITER-PLAN.md +++ b/plan/RATE-LIMITER-PLAN.md @@ -105,8 +105,10 @@ graph TD | Static cache (images/logos) | `static:{ns}:{sha256}` | 3–24 h | 0 | ~2.4 GB | | IP request counter | `rl:ip:{ip}:reqs` | 60 s | 1 | ~0.6 MB | | IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | 1 | ~0.06 MB | +| Blocked-IP index | `rl:index:blocked_ips` | permanent ZSET | 1 | ~0.01 MB | | User request counter | `rl:{ns}:user:{uid}:reqs` | 60 s | 1 | negligible | | User blocked marker | `rl:{ns}:user:{uid}:blocked` | 300 s | 1 | negligible | +| Blocked-user index | `rl:index:blocked_users` | permanent ZSET | 1 | negligible | | Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | 1 | ~0.3 MB | | UA deny list | `rl:bot:ua:blocked` | permanent SET | 1 | ~0.03 MB | | NS/IP/window counter | `rl:{ns}:ip:{ip}:w:{bucket}` | 120 s | 1 | ~0.6 MB | @@ -353,7 +355,7 @@ rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ redis-cli -n 1 --scan --pattern 'rl:ip:*' | head -20 -# All currently blocked IPs +# All currently blocked IPs (legacy SCAN — use ZSET index below instead) rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ redis-cli -n 1 --scan --pattern 'rl:ip:*:blocked' ``` @@ -361,7 +363,21 @@ rancher kubectl exec -n sapl-redis deploy/sapl-redis -- \ Via port-forward (local machine — run `kubectl port-forward svc/redis -n sapl-redis 6379:6379` first): ```bash -# All blocked IPs with value and remaining TTL +# All active blocked IPs via ZSET index (O(log N), no SCAN) +NOW=$(date +%s) +redis-cli -n 1 ZRANGEBYSCORE rl:index:blocked_ips $NOW +inf WITHSCORES + +# All active blocked users via ZSET index +redis-cli -n 1 ZRANGEBYSCORE rl:index:blocked_users $NOW +inf WITHSCORES + +# Count of currently active blocked IPs +redis-cli -n 1 ZCOUNT rl:index:blocked_ips $NOW +inf + +# Prune expired entries from both indexes 
(safe to run anytime) +redis-cli -n 1 ZREMRANGEBYSCORE rl:index:blocked_ips 0 $((NOW - 1)) +redis-cli -n 1 ZREMRANGEBYSCORE rl:index:blocked_users 0 $((NOW - 1)) + +# Legacy: blocked IPs with value and remaining TTL (still works; slower on large key spaces) redis-cli -n 1 --scan --pattern 'rl:ip:*:blocked' | while read key; do echo "$key → $(redis-cli -n 1 GET $key) (TTL: $(redis-cli -n 1 TTL $key)s)" done @@ -615,9 +631,12 @@ Requests that pass nginx reach Python. The middleware counts them in a | `RATE_LIMITER_RATE_BOT` | `5/m` | *(reserved — bots are currently blocked outright, not counted)* | | `RATE_LIMITER_UA_BLOCKLIST_REFRESH` | `60` s | How often each worker re-fetches `rl:bot:ua:blocked` from Redis | -When the window count hits the threshold the IP/user is written to a Redis -blocked-set with a 300 s TTL and subsequent requests return 429 with -`Retry-After: 300` — without touching the database. +When the window count hits the threshold the IP/user block key is written +atomically (Lua: `SET key 1 EX 300` + `ZADD index score key`) with a 300 s TTL +and subsequent requests return 429 with `Retry-After: 300` — without touching +the database. The ZADD records the full key name in `rl:index:blocked_ips` or +`rl:index:blocked_users` with score = expiry unix timestamp, enabling O(log N) +enumeration of all active blocks without a `SCAN`. 
Decision flow inside `RateLimitMiddleware.__call__()` / `_evaluate()`: @@ -872,8 +891,10 @@ Redis PDF caching would solve "high request volume reaching the file layer" — | 1 | IP rate-limit counter | `rl:ip:{ip}:reqs` | 60 s | 35 (`RATE_LIMITER_RATE`) | `RL_IP_REQUESTS` | | 1 | IP 404 counter | `rl:ip:{ip}:404s` | 60 s | 10 (`RATE_LIMIT_404_THRESHOLD`) | `RL_IP_404S` | | 1 | IP blocked marker | `rl:ip:{ip}:blocked` | 300 s | — | `RL_IP_BLOCKED` | +| 1 | Blocked-IP ZSET index | `rl:index:blocked_ips` | permanent ZSET, score=expiry ts | — | `RL_INDEX_BLOCKED_IPS` | | 1 | User rate-limit counter | `rl:{ns}:user:{uid}:reqs` | 60 s | 120 (`RATE_LIMITER_RATE_AUTHENTICATED`) | `RL_USER_REQUESTS` | | 1 | User blocked marker | `rl:{ns}:user:{uid}:blocked` | 300 s | — | `RL_USER_BLOCKED` | +| 1 | Blocked-user ZSET index | `rl:index:blocked_users` | permanent ZSET, score=expiry ts | — | `RL_INDEX_BLOCKED_USERS` | | 1 | Namespace/IP sliding window | `rl:{ns}:ip:{ip}:w:{bucket}` | 120 s | 35 (`RATE_LIMITER_RATE`) | `RL_NS_WINDOW` | | 1 | Path counter (`/media/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — (observability only) | `RL_PATH_REQUESTS` | | 1 | Path counter (`/static/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — | *Future* (requires OpenResty/Lua) | @@ -984,6 +1005,16 @@ pre-warming or public interest event). --- +**`rl:index:blocked_ips` / `rl:index:blocked_users` — ZSET enumeration indexes** + +Written atomically alongside every block-key write via `_BLOCK_LUA` (Lua: `SET key 1 EX ttl` + `ZADD index expire_ts key`). Score = unix expiry timestamp. + +Catches: gives monitoring and admin tooling an O(log N) view of all active blocks — `ZRANGEBYSCORE index +inf` — without a fleet-wide `SCAN` that would block Redis during large key spaces. Also enables fast `ZCOUNT` for alerting on block-rate spikes. + +Misses: stale entries (blocks that expired naturally) accumulate in the ZSET because Redis does not auto-remove ZSET members when the referenced key expires. 
Prune periodically with `ZREMRANGEBYSCORE index 0 <now-1>`.
+    Score = expiry unix timestamp so the index can be pruned with
+    ZREMRANGEBYSCORE <index> 0 <now>.
self._rl_cache.set(RL_IP_BLOCKED.format(ip=ip), 1, timeout=self.BLOCK_TTL) + _set_block(RL_IP_BLOCKED.format(ip=ip), RL_INDEX_BLOCKED_IPS, self.BLOCK_TTL) logger.warning( 'ratelimit_block layer=django reason=404_scan ip=%s path=%s namespace=%s', ip, request.path, _NAMESPACE, diff --git a/sapl/settings.py b/sapl/settings.py index 781d86cbf..7c3d834ce 100644 --- a/sapl/settings.py +++ b/sapl/settings.py @@ -243,7 +243,7 @@ def _build_cache_layer(pod_namespace, cache_backend, redis_url): """ if cache_backend == 'redis' and bool(redis_url): _pool = { - 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak + 'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak 'socket_timeout': 0.5, 'socket_connect_timeout': 0.5, } @@ -439,9 +439,9 @@ RATE_LIMIT_BYPASS_PATHS = [ # Anon quota is tighter than auth quota — mirrors the rate limiter relationship. # Both must be > their respective per-minute rate limit thresholds (35 anon, 120 auth), # otherwise the quota fires before the rate limiter ever engages. -API_QUOTA_ANON_DAILY = config('API_QUOTA_ANON_DAILY', default=50, cast=int) +API_QUOTA_ANON_DAILY = config('API_QUOTA_ANON_DAILY', default=50, cast=int) API_QUOTA_ANON_WEEKLY = config('API_QUOTA_ANON_WEEKLY', default=350, cast=int) -API_QUOTA_AUTH_DAILY = config('API_QUOTA_AUTH_DAILY', default=1000, cast=int) +API_QUOTA_AUTH_DAILY = config('API_QUOTA_AUTH_DAILY', default=1000, cast=int) API_QUOTA_AUTH_WEEKLY = config('API_QUOTA_AUTH_WEEKLY', default=7000, cast=int) # Media file serving — serve_media (sapl/base/media.py) via X-Accel-Redirect. @@ -512,6 +512,7 @@ pg_utils.utc_tzinfo_factory = _compat_utc_tzinfo_factory ## America/Sao_Paulo), so the suffix is constant noise that bloats key names. ## import django.utils.cache as _dj_cache + _dj_cache._i18n_cache_key_suffix = lambda request, cache_key: cache_key # DATE_FORMAT = 'N j, Y'