diff --git a/plan/RATE-LIMITER-PLAN.md b/plan/RATE-LIMITER-PLAN.md index 2df03733d..396f83595 100644 --- a/plan/RATE-LIMITER-PLAN.md +++ b/plan/RATE-LIMITER-PLAN.md @@ -112,8 +112,8 @@ graph TD | Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | 1 | ~0.3 MB | | UA deny list | `rl:bot:ua:blocked` | permanent SET | 1 | ~0.03 MB | | NS/IP/window counter | `rl:{ns}:ip:{ip}:w:{bucket}` | 120 s | 1 | ~0.6 MB | -| API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}:ip:{ip}` | 24 h | 1 | negligible | -| API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}:ip:{ip}` | 7 d | 1 | negligible | +| API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}` (HASH, field=ip) | 24 h | 1 | ~32 MB at 1 200 tenants | +| API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}` (HASH, field=ip) | 7 d | 1 | ~158 MB at 1 200 tenants | | API rate counter (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:reqs` | 60 s | 1 | negligible | | API block marker (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:blocked` | 60 s | 1 | negligible | | Redis overhead (× 1.5) | | | | ~1.6 GB | @@ -1214,8 +1214,8 @@ Redis PDF caching would solve "high request volume reaching the file layer" — | 1 | Path counter (`/media/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — (observability only) | `RL_PATH_REQUESTS` | | 1 | Path counter (`/static/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — | *Future* (requires OpenResty/Lua) | | 1 | UA deny list | `rl:bot:ua:blocked` | permanent SET | — (block on match) | `RL_UA_BLOCKLIST` | -| 1 | API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}:ip:{ip}` | 24 h | 100 000 (`API_QUOTA_DAILY`) | `QUOTA_IP_DAILY` | -| 1 | API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}:ip:{ip}` | 7 d | 700 000 (`API_QUOTA_WEEKLY`) | `QUOTA_IP_WEEKLY` | +| 1 | API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}` HASH, field=ip | 24 h | 100 000 (`API_QUOTA_DAILY`) | `QUOTA_DAILY_HASH` | +| 1 | API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}` HASH, field=ip | 7 d | 700 000 (`API_QUOTA_WEEKLY`) | `QUOTA_WEEKLY_HASH` | | 1 | API IP rate counter (all callers, ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:reqs` | 60 s (`API_RATE_LIMIT_WINDOW_SECONDS`) | 120 (`API_RATE_LIMIT_THRESHOLD`) | `RL_API_IP_REQUESTS` | | 1 | API IP block marker (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:blocked` | 60 s (`API_RATE_LIMIT_BLOCK_SECONDS`) | — | `RL_API_IP_BLOCKED` | | 1 | API blocked-IP ZSET index | `rl:index:api_blocked_ips:{0..N-1}` | self-pruning ZSET, score=expiry ts, N=`RATE_LIMITER_INDEX_SHARDS` (default 3) | — | `RL_INDEX_API_BLOCKED_IPS` | diff --git a/sapl/middleware/ratelimit.py b/sapl/middleware/ratelimit.py index 7f2328cfb..ee8abc5a4 100644 --- a/sapl/middleware/ratelimit.py +++ b/sapl/middleware/ratelimit.py @@ -81,14 +81,16 @@ RL_API_IP_BLOCKED = 'rl:api:ns:{ns}:ip:{ip}:blocked' RL_INDEX_API_BLOCKED_IPS = 'rl:index:api_blocked_ips' # --------------------------------------------------------------------------- -# API quota keys — per-consumer, per-day/week, tenant-scoped. -# Consumer identity: authenticated users by uid, anonymous by masked IP. +# API quota keys — per-tenant HASH, one key per period. +# Structure: HASH key = quota:{ns}:daily:{date} field = {ip} value = counter +# HASH key = quota:{ns}:weekly:{week} field = {ip} value = counter # Weekly key uses ISO week notation (yyyy-Www) — unambiguous, Monday-anchored. -# TTL set only on first INCR (Lua); daily=24h, weekly=7d — cleanup only, -# resets are implicit in the date/week embedded in the key name. +# TTL set once on hash creation (Lua TTL guard); resets are implicit in the +# date/week embedded in the key name. Fields are IPs; no per-field TTL. +# Memory: ~63 bytes/field vs ~148 bytes for per-IP STRING keys (~57% saving). # --------------------------------------------------------------------------- -QUOTA_IP_DAILY = 'quota:{ns}:daily:{date}:ip:{ip}' -QUOTA_IP_WEEKLY = 'quota:{ns}:weekly:{week}:ip:{ip}' +QUOTA_DAILY_HASH = 'quota:{ns}:daily:{date}' +QUOTA_WEEKLY_HASH = 'quota:{ns}:weekly:{week}' # --------------------------------------------------------------------------- # Bot UA fragments @@ -115,6 +117,17 @@ _INCR_LUA = """ return n """ +# Atomically increment a HASH field and set TTL on the hash key the first time +# it is created (TTL < 0 covers both -1 = no expire and -2 = key just created). +# KEYS[1] = hash key ARGV[1] = field (IP) ARGV[2] = TTL seconds +_HINCRBY_LUA = """ + local n = redis.call('HINCRBY', KEYS[1], ARGV[1], 1) + if redis.call('TTL', KEYS[1]) < 0 then + redis.call('EXPIRE', KEYS[1], ARGV[2]) + end + return n +""" + # Atomically write a block key and record it in the sharded ZSET index. # Prunes expired entries from the target shard before inserting the new one, # keeping each shard bounded to only active blocks (amortised O(1) cleanup). @@ -289,6 +302,26 @@ def _incr_with_ttl(key, ttl): return count +def _hincrby_with_ttl(hash_key, field, ttl): + """ + Atomic HINCRBY + conditional EXPIRE via Redis Lua script (ratelimit cache, DB 1). + + Increments the counter for `field` inside `hash_key` and sets a TTL on the + hash key if it does not already have one (i.e. on first creation). + Falls back to a composite STRING key `hash_key:field` when Redis is unavailable. + """ + try: + from django_redis import get_redis_connection + client = get_redis_connection('ratelimit') + return client.eval(_HINCRBY_LUA, 1, hash_key, field, ttl) + except Exception: + rl_cache = caches['ratelimit'] + fallback_key = f'{hash_key}:{field}' + count = (rl_cache.get(fallback_key) or 0) + 1 + rl_cache.set(fallback_key, count, timeout=ttl) + return count + + def _set_block(block_key, index_key, ttl): """ Atomically set a block key (with TTL) and record it in a sharded ZSET index. @@ -561,6 +594,11 @@ class RateLimitMiddleware: """ Increment daily and weekly API quota counters for all /api/ callers. All callers are keyed by IP — auth status is not checked. + + Keys are HASH structures: one hash per tenant per period, field = IP. + This keeps the global keyspace at O(tenants) instead of O(unique IPs), + reducing Redis memory ~57% vs. per-IP STRING keys at scale. + Fails open (returns None) if Redis/cache is unavailable. """ today = date.today() @@ -569,13 +607,13 @@ class RateLimitMiddleware: week_str = f'{iso[0]}-W{iso[1]:02d}' ip = get_client_ip(request) - d_key = QUOTA_IP_DAILY.format(ns=_NAMESPACE, date=date_str, ip=ip) - w_key = QUOTA_IP_WEEKLY.format(ns=_NAMESPACE, week=week_str, ip=ip) + d_hash = QUOTA_DAILY_HASH.format(ns=_NAMESPACE, date=date_str) + w_hash = QUOTA_WEEKLY_HASH.format(ns=_NAMESPACE, week=week_str) try: - if _incr_with_ttl(d_key, 86400) > self.api_quota_daily: + if _hincrby_with_ttl(d_hash, ip, 86400) > self.api_quota_daily: return 'daily' - if _incr_with_ttl(w_key, 7 * 86400) > self.api_quota_weekly: + if _hincrby_with_ttl(w_hash, ip, 7 * 86400) > self.api_quota_weekly: return 'weekly' except Exception: pass # fail open — quota not enforced when Redis unavailable diff --git a/sapl/middleware/test_ratelimiter.py b/sapl/middleware/test_ratelimiter.py index 31b3ec695..3dd359c43 100644 --- a/sapl/middleware/test_ratelimiter.py +++ b/sapl/middleware/test_ratelimiter.py @@ -13,12 +13,15 @@ from django.test import RequestFactory from sapl.middleware.ratelimit import ( _NAMESPACE, + _hincrby_with_ttl, _index_shard, _is_same_origin, _is_suspicious_headers, _parse_rate, get_client_ip, make_ratelimit_cache_key, + QUOTA_DAILY_HASH, + QUOTA_WEEKLY_HASH, RateLimitMiddleware, RL_API_IP_BLOCKED, RL_API_IP_REQUESTS, @@ -608,19 +611,59 @@ def test_api_block_response_is_json_with_retry_after(): def test_api_auth_user_daily_quota_exceeded_returns_429(): - """Auth users are subject to the same daily quota as anon callers (keyed by pk).""" + """Auth users are subject to the same daily quota as anon callers (keyed by IP).""" mw, _ = _make_middleware() request = _api_req(ip='10.0.0.1') request.user = MagicMock(is_authenticated=True, pk=42) mw.api_quota_daily = 1 - with patch('sapl.middleware.ratelimit._incr_with_ttl', return_value=2): + with patch('sapl.middleware.ratelimit._hincrby_with_ttl', return_value=2): resp = mw(request) assert resp.status_code == 429 assert resp['X-RateLimit-Reason'] == 'quota_daily' +def test_api_weekly_quota_exceeded_returns_429(): + """Weekly quota block fires when daily passes but weekly counter exceeds limit.""" + mw, _ = _make_middleware() + request = _api_req(ip='10.0.0.2') + mw.api_quota_daily = 999999 + mw.api_quota_weekly = 1 + + # daily returns 1 (under limit), weekly returns 2 (over limit) + with patch('sapl.middleware.ratelimit._hincrby_with_ttl', side_effect=[1, 2]): + resp = mw(request) + + assert resp.status_code == 429 + assert resp['X-RateLimit-Reason'] == 'quota_weekly' + + +def test_api_quota_uses_hash_keys(): + """_check_api_quota calls _hincrby_with_ttl with HASH keys (no IP in key name).""" + from datetime import date + mw, _ = _make_middleware() + request = _api_req(ip='10.0.0.3') + mw.api_quota_daily = 999999 + mw.api_quota_weekly = 999999 + + with patch('sapl.middleware.ratelimit._hincrby_with_ttl', return_value=1) as mock_h: + mw._check_api_quota(request) + + today = date.today() + iso = today.isocalendar() + expected_daily_hash = QUOTA_DAILY_HASH.format(ns=_NAMESPACE, date=today.isoformat()) + expected_weekly_hash = QUOTA_WEEKLY_HASH.format( + ns=_NAMESPACE, week=f'{iso[0]}-W{iso[1]:02d}' + ) + calls = mock_h.call_args_list + assert calls[0][0][0] == expected_daily_hash # first arg of first call = hash key + assert calls[1][0][0] == expected_weekly_hash # first arg of second call = hash key + # IP is the field (second positional arg), not embedded in the key + assert '10.0.0.3' not in calls[0][0][0] + assert '10.0.0.3' not in calls[1][0][0] + + def test_non_api_path_uses_global_evaluate_not_api_handler(): mw, _ = _make_middleware() mw._handle_api = MagicMock()