From f5b123385b77ddf2588093051536f3803e519e63 Mon Sep 17 00:00:00 2001 From: Edward Oliveira Date: Tue, 9 Jun 2026 16:18:02 -0300 Subject: [PATCH] Collapse rl:metrics STRING keys into a HASH per tenant per day MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously: one STRING key per (ns, date, reason) — 1,200 tenants × 10 reasons × 8-day TTL = 96k live keys, each increment via Lua eval. Now: one HASH key per (ns, date), field = reason. Reduces live key count 10× to 9,600. Uses _hincrby_with_ttl (already exists for API quota) so no new Lua script is needed. HGETALL returns all reasons for a tenant in one round-trip instead of requiring SCAN + GET. No cross-tenant contention existed before (keys are per-ns); this change reduces per-key overhead and makes monitoring simpler. Co-Authored-By: Claude Sonnet 4.6 --- plan/RATE-LIMITER-PLAN.md | 5 ++++- sapl/middleware/ratelimit.py | 8 +++----- sapl/middleware/test_ratelimiter.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/plan/RATE-LIMITER-PLAN.md b/plan/RATE-LIMITER-PLAN.md index cd9da2ea1..b1445df4d 100644 --- a/plan/RATE-LIMITER-PLAN.md +++ b/plan/RATE-LIMITER-PLAN.md @@ -117,6 +117,7 @@ graph TD | API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}` (HASH, field=ip) | 7 d | 1 | ~158 MB at 1 200 tenants | | API rate counter (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:reqs` | 60 s | 1 | negligible | | API block marker (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:blocked` | 60 s | 1 | negligible | +| Block metrics (HASH) | `rl:metrics:{ns}:{date}` (HASH, field=reason) | 8 d | 1 | ~0.5 MB | | Redis overhead (× 1.5) | | | | ~1.6 GB | | **Total ceiling** | | | | **~5 GB** | @@ -149,6 +150,7 @@ graph TD | API quota raised (2026-05-11) | 1 000→100 000/day · 7 000→700 000/week | Quota serves as outer envelope for all-day slow scrapers; 1 000/day was exhausted too quickly for legitimate integrations | | IP-prefix deny list added (2026-05-11) | `rl:ip_prefix:blocked` permanent SET, dot-boundary-anchored prefix match (`df2f5ee30`) | Operators need to block whole ranges (e.g. `103.124.225.*`) at runtime by curating prefixes, not individual IPs; mirrors the existing `rl:bot:ua:blocked` runtime-deny-list pattern | | Same-origin bypass moved after block checks (2026-05-11) | `_is_same_origin` now runs *after* the IP-prefix / global-IP / API-IP block checks in `_handle_api` (`5f71354f5`) | `Origin`/`Referer` are client-controlled and trivially spoofable; the bypass was short-circuiting before `ip = get_client_ip(request)` and before every block lookup, letting a forged `Origin` header defeat an operator-set `rl:ip::blocked` key entirely | +| Metrics key format consolidated to HASH (2026-06-09) | `rl:metrics:{ns}:{date}` HASH (field = reason) instead of `rl:metrics:{ns}:{date}:blocked:{reason}` STRING per reason (`d0eb02d27`) | With 1,200 tenants sharing one Redis instance, the old design produced up to 96 k live STRING keys (1,200 × ~10 reasons × 8-day TTL); each increment issued a Lua eval whose event-loop lock adds latency to all Redis commands when many tenants are under simultaneous attack. HASH consolidation reduces to 9,600 keys (10× fewer), reuses the existing `_hincrby_with_ttl` function, and makes monitoring simpler (`HGETALL` returns all reasons in one round-trip) | --- @@ -1307,6 +1309,7 @@ Redis PDF caching would solve "high request volume reaching the file layer" — | 1 | API IP rate counter (all callers, ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:reqs` | 60 s (`API_RATE_LIMIT_WINDOW_SECONDS`) | 120 (`API_RATE_LIMIT_THRESHOLD`) | `RL_API_IP_REQUESTS` | | 1 | API IP block marker (ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:blocked` | 60 s (`API_RATE_LIMIT_BLOCK_SECONDS`) | — | `RL_API_IP_BLOCKED` | | 1 | API blocked-IP ZSET index | `rl:index:api_blocked_ips:{0..N-1}` | self-pruning ZSET, score=expiry ts, N=`RATE_LIMITER_INDEX_SHARDS` (default 3) | — | `RL_INDEX_API_BLOCKED_IPS` | +| 1 | Block metrics counter | `rl:metrics:{ns}:{date}` HASH, field=reason | 8 d | — (observability only) | `RL_METRICS_BLOCKED` | | 2 | Django Channels | `channels:*` | session TTL | — | *Future* | ### What each counter catches — and misses @@ -1524,7 +1527,7 @@ header means the browser knows this is cross-origin, regardless of what `Referer | `sapl/middleware/api_emergency_block.py` | Deleted | | `sapl/settings.py` | Removed `ApiEmergencySameSiteOnlyMiddleware` from `MIDDLEWARE`; added `API_RATE_LIMIT_*` and `API_QUOTA_*` settings; auth-specific quota settings removed; threshold 60→120; block TTL 300→60 s; quota 1 000→100 000/day; added `RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH` (default `60`) | | `sapl/middleware/ratelimit.py` | Added `RL_API_IP_REQUESTS`, `RL_API_IP_BLOCKED` (both ns-scoped), `RL_INDEX_API_BLOCKED_IPS` constants; added `_is_same_origin`; extended `__init__`; added `_handle_api`, `_api_block_response`; auth check removed from `_handle_api` and `_check_api_quota` — all callers keyed by IP. Later (`df2f5ee30`): added `RL_IP_PREFIX_BLOCKLIST` constant, `_ip_prefix_blocklist`/`_ip_prefix_blocklist_fetched_at` class state, `_refresh_ip_prefix_blocklist`, `_is_ip_prefix_blocked`, and a top-of-`_evaluate` + top-of-`_handle_api` check. Later still (`5f71354f5`): reordered `_handle_api` so `ip = get_client_ip(request)` and all block-key lookups (IP-prefix, global IP, API IP) run *before* the same-origin bypass — closing the spoofed-`Origin` bypass described above | -| `sapl/middleware/test_ratelimiter.py` | Extended `_make_middleware`; added 17 new tests for `_handle_api`/quota/same-origin. Later: added `_seed_prefix_blocklist` fixture and 11 tests for the IP-prefix blocklist; added 5 regression tests proving same-origin headers can no longer bypass an active block (`test_api_same_origin_does_not_bypass_global_ip_block`, `..._api_ip_block`, `..._ip_prefix_block`, `test_api_same_origin_still_skips_quota_and_rate_limit_when_not_blocked`) | +| `sapl/middleware/test_ratelimiter.py` | Extended `_make_middleware`; added 17 new tests for `_handle_api`/quota/same-origin. Later: added `_seed_prefix_blocklist` fixture and 11 tests for the IP-prefix blocklist; added 5 regression tests proving same-origin headers can no longer bypass an active block (`test_api_same_origin_does_not_bypass_global_ip_block`, `..._api_ip_block`, `..._ip_prefix_block`, `test_api_same_origin_still_skips_quota_and_rate_limit_when_not_blocked`). Later (`d0eb02d27`): added `test_inc_block_metric_uses_hash_key` verifying metrics writes use `_hincrby_with_ttl` with the HASH key and reason as field | --- diff --git a/sapl/middleware/ratelimit.py b/sapl/middleware/ratelimit.py index daa156dae..038ef33ff 100644 --- a/sapl/middleware/ratelimit.py +++ b/sapl/middleware/ratelimit.py @@ -77,7 +77,7 @@ RL_NS_WINDOW = 'rl:{ns}:ip:{ip}:w:{bucket}' RL_PATH_REQUESTS = 'rl:{ns}:path:{sha256}:reqs' RL_UA_BLOCKLIST = 'rl:bot:ua:blocked' # permanent SET — runtime UA deny list RL_IP_PREFIX_BLOCKLIST = 'rl:ip_prefix:blocked' # permanent SET — runtime IP-prefix deny list -RL_METRICS_BLOCKED = 'rl:metrics:{ns}:{date}:blocked:{reason}' # daily counter per block reason +RL_METRICS_BLOCKED = 'rl:metrics:{ns}:{date}' # HASH: field = reason, value = daily count # ZSET indexes — members are full block-key strings, score = expiry unix timestamp. # Lets admin/monitoring tools enumerate active blocks with a single ZRANGEBYSCORE @@ -678,11 +678,9 @@ class RateLimitMiddleware: def _inc_block_metric(self, reason): """Increment daily per-reason block counter in Redis DB 1 (TTL 8 days).""" - key = RL_METRICS_BLOCKED.format( - ns=_NAMESPACE, date=date.today().isoformat(), reason=reason - ) + hash_key = RL_METRICS_BLOCKED.format(ns=_NAMESPACE, date=date.today().isoformat()) try: - _incr_with_ttl(key, ttl=8 * 86400) + _hincrby_with_ttl(hash_key, reason, ttl=8 * 86400) except Exception: pass diff --git a/sapl/middleware/test_ratelimiter.py b/sapl/middleware/test_ratelimiter.py index cf5815520..e0a4fe5b6 100644 --- a/sapl/middleware/test_ratelimiter.py +++ b/sapl/middleware/test_ratelimiter.py @@ -900,6 +900,23 @@ def test_api_quota_uses_hash_keys(): assert '10.0.0.3' not in calls[1][0][0] +def test_inc_block_metric_uses_hash_key(): + """_inc_block_metric calls _hincrby_with_ttl with a HASH key; reason is the field.""" + from datetime import date + mw, _ = _make_middleware() + today_str = date.today().isoformat() + expected_key = f'rl:metrics:{_NAMESPACE}:{today_str}' + + with patch('sapl.middleware.ratelimit._hincrby_with_ttl') as mock_h: + mw._inc_block_metric('ip_rate') + + mock_h.assert_called_once() + args = mock_h.call_args[0] + assert args[0] == expected_key, f'hash key mismatch: {args[0]!r}' + assert args[1] == 'ip_rate', f'field (reason) mismatch: {args[1]!r}' + assert 'ip_rate' not in args[0], 'reason must be HASH field, not embedded in key' + + def test_non_api_path_uses_global_evaluate_not_api_handler(): mw, _ = _make_middleware() mw._handle_api = MagicMock()