From 6b6bc810cb94de95ab0910e2ec1df4193121ac94 Mon Sep 17 00:00:00 2001 From: Edward Oliveira Date: Mon, 8 Jun 2026 16:06:00 -0300 Subject: [PATCH] Return 403 instead of 429 for IP-prefix blocklist hits; document feature A permanent operator-curated deny decision (rl:ip_prefix:blocked) is not a transient rate limit, so it should surface as 403 Forbidden with no Retry-After rather than 429. Also brings RATE-LIMITER-PLAN.md up to date with the IP-prefix blocklist feature and the same-origin bypass fix, including redis-cli commands to populate/remove/list the deny-list set. Co-Authored-By: Claude Sonnet 4.6 --- plan/RATE-LIMITER-PLAN.md | 137 ++++++++++++++++++++++++++-- sapl/middleware/ratelimit.py | 16 +++- sapl/middleware/test_ratelimiter.py | 14 ++- 3 files changed, 151 insertions(+), 16 deletions(-) diff --git a/plan/RATE-LIMITER-PLAN.md b/plan/RATE-LIMITER-PLAN.md index 396f83595..cd9da2ea1 100644 --- a/plan/RATE-LIMITER-PLAN.md +++ b/plan/RATE-LIMITER-PLAN.md @@ -111,6 +111,7 @@ graph TD | Blocked-user index | `rl:index:blocked_users` | permanent ZSET | 1 | negligible | | Path counter | `rl:{ns}:path:{sha256}:reqs` | 60 s | 1 | ~0.3 MB | | UA deny list | `rl:bot:ua:blocked` | permanent SET | 1 | ~0.03 MB | +| IP-prefix deny list | `rl:ip_prefix:blocked` | permanent SET | 1 | ~0.01 MB | | NS/IP/window counter | `rl:{ns}:ip:{ip}:w:{bucket}` | 120 s | 1 | ~0.6 MB | | API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}` (HASH, field=ip) | 24 h | 1 | ~32 MB at 1 200 tenants | | API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}` (HASH, field=ip) | 7 d | 1 | ~158 MB at 1 200 tenants | @@ -146,6 +147,8 @@ graph TD | API threshold raised (2026-05-11) | 60→120 req/min | Aligns with legitimate integration patterns; slow-drip abuse is caught by the daily quota | | API block TTL reduced (2026-05-11) | 300→60 s | Shorter cooldown reduces false-positive lockout duration for shared IPs | | API quota raised (2026-05-11) | 1 000→100 000/day · 7 000→700 000/week | Quota serves as outer envelope for all-day slow scrapers; 1 000/day was exhausted too quickly for legitimate integrations | +| IP-prefix deny list added (2026-05-11) | `rl:ip_prefix:blocked` permanent SET, dot-boundary-anchored prefix match (`df2f5ee30`) | Operators need to block whole ranges (e.g. `103.124.225.*`) at runtime by curating prefixes, not individual IPs; mirrors the existing `rl:bot:ua:blocked` runtime-deny-list pattern | +| Same-origin bypass moved after block checks (2026-05-11) | `_is_same_origin` now runs *after* the IP-prefix / global-IP / API-IP block checks in `_handle_api` (`5f71354f5`) | `Origin`/`Referer` are client-controlled and trivially spoofable; the bypass was short-circuiting before `ip = get_client_ip(request)` and before every block lookup, letting a forged `Origin` header defeat an operator-set `rl:ip::blocked` key entirely | --- @@ -446,6 +449,82 @@ rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ --- +## Manage the IP-prefix deny list (`rl:ip_prefix:blocked`) + +`rl:ip_prefix:blocked` (constant `RL_IP_PREFIX_BLOCKLIST`, added in `df2f5ee30`) is a +**permanent Redis SET in DB 1**. Each member is a dotted-decimal **prefix string** +such as `103.124.225` or `45.177.154` — or a complete dotted-quad address such as +`201.23.71.13` to block one specific IP. Any client IP that starts with a stored +prefix on a **dot boundary** is blocked, e.g. `103.124.225` matches `103.124.225.7` +but not `103.124.2250.1` or `103.124.2255` (an octet-aligned, /24-ish range match — +not a raw string prefix). A stored entry that is already a full 4-octet address +(no trailing dot) is matched by equality only. + +This check runs **before everything else** — both at the top of `_evaluate` (non-`/api/` +paths) and at the top of `_handle_api` (`/api/` paths) — and applies universally to +**all traffic**: anonymous, authenticated, `/api/` and regular paths alike. It mirrors +the `rl:bot:ua:blocked` runtime-deny-list pattern: the SET is fetched via `SMEMBERS` +at most once per `RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH` seconds (default `60`) +per worker process, so additions/removals take effect fleet-wide without a code deploy +or restart — only the in-process cache refresh delay. + +### Populate — block a range or a single IP + +```bash +# Block an entire /24-ish range by prefix (matches 103.124.225.0 – 103.124.225.255) +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:ip_prefix:blocked 103.124.225 + +# Block multiple ranges in one call +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:ip_prefix:blocked 103.124.225 45.177.154 + +# Block one specific IP (matched by equality only — no range semantics) +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SADD rl:ip_prefix:blocked 201.23.71.13 +``` + +### Remove — unblock a range or a single IP + +```bash +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SREM rl:ip_prefix:blocked 103.124.225 + +# Remove multiple entries in one call +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SREM rl:ip_prefix:blocked 103.124.225 45.177.154 201.23.71.13 +``` + +### List — inspect the current deny list + +```bash +# All prefixes/addresses currently blocked +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SMEMBERS rl:ip_prefix:blocked + +# Count of entries +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SCARD rl:ip_prefix:blocked + +# Check whether a specific prefix/address is already in the list +rancher kubectl exec -n sapl-redis deploy/sapl-redis -- redis-cli -n 1 \ + SISMEMBER rl:ip_prefix:blocked 103.124.225 +``` + +Via port-forward (local machine): + +```bash +redis-cli -n 1 SADD rl:ip_prefix:blocked 103.124.225 +redis-cli -n 1 SREM rl:ip_prefix:blocked 103.124.225 +redis-cli -n 1 SMEMBERS rl:ip_prefix:blocked +``` + +> **Note**: changes are picked up by each worker within `RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH` +> seconds (default `60`) — there is no immediate fleet-wide invalidation, by design +> (mirrors `rl:bot:ua:blocked`; avoids a Redis round-trip on every request). + +--- + ## Local standalone Redis (development / testing) No Kubernetes? Run Redis directly with Docker: @@ -671,6 +750,10 @@ anonymous/bot traffic. Decision flow inside `RateLimitMiddleware.__call__()` / `_evaluate()`: ``` +-1. IP starts with a prefix in rl:ip_prefix:blocked? → 403 reason=ip_prefix_blocked + (checked first, before everything else; universal — applies to + authenticated users too; 403 + no Retry-After, since this is a + permanent operator-curated deny decision, not a transient rate limit) 0. /api/ path AND consumer daily/weekly quota exceeded? → 429 reason=quota_daily / quota_weekly (per-consumer: auth users by pk, anon by masked IP; fail-open when Redis unavailable) @@ -694,7 +777,11 @@ Decision flow inside `RateLimitMiddleware.__call__()` / `_evaluate()`: ```mermaid flowchart TD - REQ([Request]) --> C0 + REQ([Request]) --> CM1 + + CM1{"IP starts with a prefix in\nrl:ip_prefix:blocked?"} + CM1 -- "yes — operator-curated permanent deny list" --> R_PREFIX([403\nip_prefix_blocked\nno Retry-After]) + CM1 -- no --> C0 C0{"/api/ path AND\ndaily/weekly quota exceeded?"} C0 -- "yes — quota:{ns}:daily/weekly:{period}:user/ip exceeded" --> R_QUOTA([429\nquota_daily / quota_weekly]) @@ -1214,6 +1301,7 @@ Redis PDF caching would solve "high request volume reaching the file layer" — | 1 | Path counter (`/media/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — (observability only) | `RL_PATH_REQUESTS` | | 1 | Path counter (`/static/`) | `rl:{ns}:path:{sha256}:reqs` | 60 s | — | *Future* (requires OpenResty/Lua) | | 1 | UA deny list | `rl:bot:ua:blocked` | permanent SET | — (block on match) | `RL_UA_BLOCKLIST` | +| 1 | IP-prefix deny list | `rl:ip_prefix:blocked` | permanent SET | — (block on prefix match) | `RL_IP_PREFIX_BLOCKLIST` | | 1 | API daily quota (all callers, by IP) | `quota:{ns}:daily:{date}` HASH, field=ip | 24 h | 100 000 (`API_QUOTA_DAILY`) | `QUOTA_DAILY_HASH` | | 1 | API weekly quota (all callers, by IP) | `quota:{ns}:weekly:{week}` HASH, field=ip | 7 d | 700 000 (`API_QUOTA_WEEKLY`) | `QUOTA_WEEKLY_HASH` | | 1 | API IP rate counter (all callers, ns-scoped) | `rl:api:ns:{ns}:ip:{ip}:reqs` | 60 s (`API_RATE_LIMIT_WINDOW_SECONDS`) | 120 (`API_RATE_LIMIT_THRESHOLD`) | `RL_API_IP_REQUESTS` | @@ -1357,16 +1445,18 @@ insufficient: ### Solution — `_handle_api` -`RateLimitMiddleware.__call__` delegates all `/api/` requests to `_handle_api`, which applies a separate, scoped decision chain: +`RateLimitMiddleware.__call__` delegates all `/api/` requests to `_handle_api`, which applies a separate, scoped decision chain (current order, **post-`5f71354f5` fix** — see §Security fix below for why block checks now precede the same-origin bypass): | Step | Condition | Action | |------|-----------|--------| | 1 | `OPTIONS` method | Pass — CORS preflight must never be blocked | -| 2 | Same-origin (`_is_same_origin`) | Pass — SAPL's own browser polling; no counter | -| 3 | `rl:ip::blocked` exists | 429 `global_ip_blocked` — global block also covers `/api/` | -| 4 | `rl:api:ns::ip::blocked` exists | 429 `api_ip_blocked` — API-only, tenant-scoped block | -| 5 | Daily/weekly quota exceeded | 429 `quota_daily` / `quota_weekly` | -| 6 | API counter ≥ threshold (all callers) | Write `rl:api:ns::ip::blocked`; 429 `api_threshold_exceeded` | +| 2 | `ip = get_client_ip(request)` | (no action — IP resolved up front so every check below can use it) | +| 3 | IP-prefix block (`_is_ip_prefix_blocked`) | **403** `ip_prefix_blocked` — operator-curated range block; applies to everyone. 403 (not 429) and no `Retry-After`: this is a permanent deny decision, not a transient rate limit | +| 4 | `rl:ip::blocked` exists | 429 `global_ip_blocked` — global block also covers `/api/` | +| 5 | `rl:api:ns::ip::blocked` exists | 429 `api_ip_blocked` — API-only, tenant-scoped block | +| 6 | Same-origin (`_is_same_origin`) | Pass, skipping steps 7-8 only — SAPL's own browser polling is exempt from quota/rate-limit *accounting*, never from the block checks above | +| 7 | Daily/weekly quota exceeded | 429 `quota_daily` / `quota_weekly` | +| 8 | API counter ≥ threshold (all callers) | Write `rl:api:ns::ip::blocked`; 429 `api_threshold_exceeded` | | — | Under threshold | Pass | Auth status is **not checked**. Authenticated and anonymous callers are treated identically — both keyed by IP, both subject to the same threshold and quota. `_evaluate` (240/min per-user) still governs all non-`/api/` paths. @@ -1374,6 +1464,33 @@ Auth status is **not checked**. Authenticated and anonymous callers are treated **Key invariant**: `rl:ip::blocked` is **never written** because of `/api/` abuse. `rl:api:ns::ip::blocked` is tenant-scoped and blocks only `/api/` — page requests from the same NAT continue, and a block in one k8s namespace does not affect other tenants. +### Security fix — same-origin bypass let blocked IPs through (`5f71354f5`, 2026-05-11) + +**Symptom**: an operator set `rl:ip:201.23.71.13:blocked` (the global `RL_IP_BLOCKED` key) with a large TTL, expecting that IP to be blocked everywhere — per the middleware's own documented decision flow, "global IP block also covers `/api/`". Requests from that IP to `/api/` endpoints kept getting through. + +**Root cause**: in the original `_handle_api`, the same-origin check ran — and could `return` — *before* `ip = get_client_ip(request)` and before any of the block-key lookups: + +```python +def _handle_api(self, request): + if request.method == 'OPTIONS': + return self.get_response(request) + if self.api_same_origin_bypass and _is_same_origin(request): + return self.get_response(request) # <-- returned HERE, nothing below ever ran + ip = get_client_ip(request) + # ...IP-prefix block, global IP block, API block, quota, rate limit +``` + +`_is_same_origin` decides "same origin" purely by parsing the client-supplied `Origin`/`Referer` headers and comparing their host to `request.get_host()` — **both headers are entirely client-controlled**, with no CSRF-token or session/cookie verification backing them. Any bot can send `Origin: https://` and be treated as "SAPL's own browser polling," getting a complete free pass on the IP-prefix blocklist, the global IP block, the API-specific IP block, daily/weekly quota, and the per-minute API rate limit. `201.23.71.13` was simply sending requests with a spoofed `Origin`/`Referer` matching the SAPL host. + +**Fix**: reorder `_handle_api` so block checks — decisions already made by the system or an operator about an IP — always run first and can never be bypassed by spoofable headers. The same-origin bypass now sits *after* the IP-prefix, global-IP, and API-IP block checks (step 6 above) and exempts a request from quota/rate-limit *accounting* only, never from an active block. See the updated decision-chain table above for the full new order. + +```bash +# Manual verification (against local Redis, ratelimit cache / DB 1): +redis-cli -n 1 SET rl:ip:201.23.71.13:blocked 1 EX 3600 +curl -H 'X-Forwarded-For: 201.23.71.13' -H 'Origin: https://' http://localhost:8000/api/materia/ +# Before the fix: 200 (bypassed). After the fix: 429 with X-RateLimit-Reason: global_ip_blocked +``` + ### Same-origin detection — `_is_same_origin` Replaces `ApiEmergencySameSiteOnlyMiddleware._came_from_same_host` (deleted). @@ -1405,9 +1522,9 @@ header means the browser knows this is cross-origin, regardless of what `Referer | File | Change | |------|--------| | `sapl/middleware/api_emergency_block.py` | Deleted | -| `sapl/settings.py` | Removed `ApiEmergencySameSiteOnlyMiddleware` from `MIDDLEWARE`; added `API_RATE_LIMIT_*` and `API_QUOTA_*` settings; auth-specific quota settings removed; threshold 60→120; block TTL 300→60 s; quota 1 000→100 000/day | -| `sapl/middleware/ratelimit.py` | Added `RL_API_IP_REQUESTS`, `RL_API_IP_BLOCKED` (both ns-scoped), `RL_INDEX_API_BLOCKED_IPS` constants; added `_is_same_origin`; extended `__init__`; added `_handle_api`, `_api_block_response`; auth check removed from `_handle_api` and `_check_api_quota` — all callers keyed by IP | -| `sapl/middleware/test_ratelimiter.py` | Extended `_make_middleware`; added 17 new tests | +| `sapl/settings.py` | Removed `ApiEmergencySameSiteOnlyMiddleware` from `MIDDLEWARE`; added `API_RATE_LIMIT_*` and `API_QUOTA_*` settings; auth-specific quota settings removed; threshold 60→120; block TTL 300→60 s; quota 1 000→100 000/day; added `RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH` (default `60`) | +| `sapl/middleware/ratelimit.py` | Added `RL_API_IP_REQUESTS`, `RL_API_IP_BLOCKED` (both ns-scoped), `RL_INDEX_API_BLOCKED_IPS` constants; added `_is_same_origin`; extended `__init__`; added `_handle_api`, `_api_block_response`; auth check removed from `_handle_api` and `_check_api_quota` — all callers keyed by IP. Later (`df2f5ee30`): added `RL_IP_PREFIX_BLOCKLIST` constant, `_ip_prefix_blocklist`/`_ip_prefix_blocklist_fetched_at` class state, `_refresh_ip_prefix_blocklist`, `_is_ip_prefix_blocked`, and a top-of-`_evaluate` + top-of-`_handle_api` check. Later still (`5f71354f5`): reordered `_handle_api` so `ip = get_client_ip(request)` and all block-key lookups (IP-prefix, global IP, API IP) run *before* the same-origin bypass — closing the spoofed-`Origin` bypass described above | +| `sapl/middleware/test_ratelimiter.py` | Extended `_make_middleware`; added 17 new tests for `_handle_api`/quota/same-origin. Later: added `_seed_prefix_blocklist` fixture and 11 tests for the IP-prefix blocklist; added 5 regression tests proving same-origin headers can no longer bypass an active block (`test_api_same_origin_does_not_bypass_global_ip_block`, `..._api_ip_block`, `..._ip_prefix_block`, `test_api_same_origin_still_skips_quota_and_rate_limit_when_not_blocked`) | --- diff --git a/sapl/middleware/ratelimit.py b/sapl/middleware/ratelimit.py index 0aafbfe3d..0d6885231 100644 --- a/sapl/middleware/ratelimit.py +++ b/sapl/middleware/ratelimit.py @@ -3,11 +3,12 @@ RateLimitMiddleware — cross-pod rate limiting backed by shared Redis. Decision flow (per request): Both /api/ and non-/api/ paths — checked first, before all else: - -1. IP in rl:ip_prefix:blocked (prefix match)? → 429 + -1. IP in rl:ip_prefix:blocked (prefix match)? → 403 (Forbidden — not a rate + limit; an operator-curated permanent deny list, no Retry-After) (universal — applies to authenticated users too, like the UA bot checks) /api/ paths — handled by _handle_api: 0a. OPTIONS? → pass (CORS preflight must never be blocked) - 0b. rl:ip_prefix:blocked? → 429 (prefix match; see -1 above) + 0b. rl:ip_prefix:blocked? → 403 (prefix match; see -1 above) 0c. rl:ip::blocked? → 429 (global block also covers /api/) 0d. rl:api:ip::blocked? → 429 (API-only block) Block checks 0b-0d always run before the same-origin check below — @@ -415,8 +416,11 @@ class RateLimitMiddleware: extra={'ua': request.META.get('HTTP_USER_AGENT', '')}, ) self._inc_block_metric(decision['reason']) - response = HttpResponse(status=429) - response['Retry-After'] = self.BLOCK_TTL + if decision['reason'] == 'ip_prefix_blocked': + response = HttpResponse(status=403) + else: + response = HttpResponse(status=429) + response['Retry-After'] = self.BLOCK_TTL response['X-RateLimit-Reason'] = decision['reason'] return response logger.debug( @@ -437,6 +441,10 @@ class RateLimitMiddleware: def _api_block_response(self, reason, retry_after=None): from django.http import JsonResponse + if reason == 'ip_prefix_blocked': + resp = JsonResponse({'detail': 'Forbidden.'}, status=403) + resp['X-RateLimit-Reason'] = reason + return resp if retry_after is None: retry_after = self.api_block_seconds resp = JsonResponse( diff --git a/sapl/middleware/test_ratelimiter.py b/sapl/middleware/test_ratelimiter.py index 3fb585a7a..e0898f7ec 100644 --- a/sapl/middleware/test_ratelimiter.py +++ b/sapl/middleware/test_ratelimiter.py @@ -531,6 +531,16 @@ def test_call_block_returns_429_with_retry_after_header(): mw.get_response.assert_not_called() +def test_call_ip_prefix_block_returns_403_without_retry_after_header(): + mw, _ = _make_middleware() + mw._evaluate = MagicMock(return_value={'action': 'block', 'reason': 'ip_prefix_blocked', 'ip': '1.2.3.4'}) + response = mw(_factory.get('/')) + assert response.status_code == 403 + assert 'Retry-After' not in response + assert response['X-RateLimit-Reason'] == 'ip_prefix_blocked' + mw.get_response.assert_not_called() + + def test_call_pass_forwards_request_to_get_response(): mw, _ = _make_middleware() mw._evaluate = MagicMock(return_value={'action': 'pass', 'ip': '1.2.3.4'}) @@ -698,7 +708,7 @@ def test_api_same_origin_does_not_bypass_ip_prefix_block(_seed_prefix_blocklist) response = mw(request) mw.get_response.assert_not_called() - assert response.status_code == 429 + assert response.status_code == 403 assert response['X-RateLimit-Reason'] == 'ip_prefix_blocked' @@ -733,7 +743,7 @@ def test_api_blocks_on_ip_prefix_before_quota_checks(_seed_prefix_blocklist): mw.get_response.assert_not_called() mw._check_api_quota.assert_not_called() mw._incr_with_ttl.assert_not_called() - assert response.status_code == 429 + assert response.status_code == 403 assert response['X-RateLimit-Reason'] == 'ip_prefix_blocked'