diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8cd613d77..72523774d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -57,7 +57,16 @@ RUN set -eux; \
     if [ "$WITH_GRAPHVIZ" = "1" ]; then apt-get install -y --no-install-recommends graphviz; fi; \
     if [ "$WITH_POPPLER" = "1" ]; then apt-get install -y --no-install-recommends poppler-utils; fi; \
     if [ "$WITH_PSQL_CLIENT" = "1" ]; then apt-get install -y --no-install-recommends postgresql-client; fi; \
-    if [ "$WITH_NGINX" = "1" ]; then apt-get install -y --no-install-recommends nginx libnginx-mod-http-geoip2 libmaxminddb0; fi; \
+    if [ "$WITH_NGINX" = "1" ]; then \
+        curl -fsSL https://openresty.org/package/pubkey.gpg | gpg --dearmor -o /usr/share/keyrings/openresty.gpg; \
+        echo "deb [signed-by=/usr/share/keyrings/openresty.gpg] http://openresty.org/package/debian bookworm openresty" \
+            > /etc/apt/sources.list.d/openresty.list; \
+        apt-get update; \
+        # openresty-opm is a separate package and provides the opm CLI used below.
+        # libmaxminddb-dev adds the unversioned libmaxminddb.so name (NOTE(review): confirm the name lua-resty-maxminddb loads via FFI).
+        apt-get install -y --no-install-recommends openresty openresty-opm libmaxminddb0 libmaxminddb-dev; \
+        opm get anjia0532/lua-resty-maxminddb; \
+    fi; \
     rm -rf /var/lib/apt/lists/*
 
 # Usuários/grupos (idempotente)
@@ -73,7 +82,6 @@ RUN mkdir -p \
       /var/interlegis/sapl/media \
       /var/interlegis/sapl/run \
       /var/interlegis/sapl/tmp \
-      /etc/nginx/geoip \
  && chown -R root:nginx /var/interlegis/sapl /var/interlegis/sapl/run \
  && chmod -R g+rwX /var/interlegis/sapl \
  && chmod 2775 /var/interlegis/sapl /var/interlegis/sapl/run \
@@ -87,18 +95,21 @@ COPY --from=builder ${VENV_DIR} ${VENV_DIR}
 # Código da aplicação (depois do venv para aproveitar cache)
 COPY . /var/interlegis/sapl/
 
-# Nginx config + GeoLite2-ASN database (somente se instalado).
+# OpenResty config + GeoLite2-ASN database (somente se instalado).
 #
 # GeoLite2-ASN.mmdb is NOT downloaded at build time.
 # Run docker/geoip/update_geoip.sh before each build to refresh it.
 # The .mmdb file lives at docker/geoip/GeoLite2-ASN.mmdb (git-ignored binary).
 # If the file is absent the build FAILS — run update_geoip.sh first.
 RUN if [ "$WITH_NGINX" = "1" ]; then \
-      rm -f /etc/nginx/conf.d/*; \
-      cp docker/config/nginx/sapl.conf /etc/nginx/conf.d/sapl.conf.template; \
-      cp docker/config/nginx/nginx.conf /etc/nginx/nginx.conf; \
+      OR_CONF=/usr/local/openresty/nginx/conf; \
+      mkdir -p $OR_CONF/conf.d $OR_CONF/geoip; \
+      rm -f $OR_CONF/conf.d/*; \
+      cp docker/config/nginx/sapl.conf $OR_CONF/conf.d/sapl.conf.template; \
+      cp docker/config/nginx/nginx.conf $OR_CONF/nginx.conf; \
+      cp docker/config/nginx/blocklist.lua $OR_CONF/blocklist.lua; \
       if [ -f "docker/geoip/GeoLite2-ASN.mmdb" ]; then \
-        cp docker/geoip/GeoLite2-ASN.mmdb /etc/nginx/geoip/GeoLite2-ASN.mmdb; \
+        cp docker/geoip/GeoLite2-ASN.mmdb $OR_CONF/geoip/GeoLite2-ASN.mmdb; \
         echo "[geoip] GeoLite2-ASN.mmdb installed."; \
       else \
         echo "[geoip] ERROR: docker/geoip/GeoLite2-ASN.mmdb not found."; \
@@ -119,10 +130,11 @@ RUN install -m 755 docker/startup_scripts/start.sh /var/interlegis
 # (Se possível, evite copiar .env no build. Use secrets/variáveis em runtime.)
 COPY docker/config/env_dockerfile /var/interlegis/sapl/sapl/.env
 
-# Logs (só se nginx estiver presente)
+# Logs (só se OpenResty estiver presente)
 RUN if [ "$WITH_NGINX" = "1" ]; then \
-      ln -sf /dev/stdout /var/log/nginx/access.log; \
-      ln -sf /dev/stderr /var/log/nginx/error.log; \
+      mkdir -p /var/log/openresty; \
+      ln -sf /dev/stdout /var/log/openresty/access.log; \
+      ln -sf /dev/stderr /var/log/openresty/error.log; \
     fi \
  && mkdir -p /var/log/sapl/ \
  && ln -sf /var/interlegis/sapl/sapl.log /var/log/sapl/sapl.log
diff --git a/docker/config/nginx/blocklist.lua b/docker/config/nginx/blocklist.lua
new file mode 100644
index 000000000..19e390e23
--- /dev/null
+++ b/docker/config/nginx/blocklist.lua
@@ -0,0 +1,101 @@
+-- blocklist.lua: early-reject blocked IPs before reaching Gunicorn.
+--
+-- Checks (in order, cheapest first):
+--   1. User-Agent in bot UA list — nginx map variable, no Redis
+--   2. ASN in datacenter deny list — lua-resty-maxminddb (MaxMind ASN DB)
+--   3. ngx.shared.ip_prefix_blocked membership — in-process cache refreshed every 60s
+--   4. GET rl:ip:{ip}:blocked — global IP block (Redis DB 1)
+--   5. GET rl:api:ns:{ns}:ip:{ip}:blocked — per-tenant API block (/api/ only, Redis DB 1)
+--
+-- Checks 4+5 (and the SELECT) are pipelined in one Redis round trip.
+-- On Redis failure: fail-open (request passes to Django).
+-- Client addresses that are not dotted-quad IPv4 (e.g. IPv6) are matched in
+-- check 3 by exact address only.
+
+-- Parse REDIS_URL (redis://host:port or redis://host:port/db).
+local redis_url = os.getenv("REDIS_URL") or "redis://127.0.0.1:6379"
+local REDIS_HOST, port_str = redis_url:match("redis://([^:/]+):(%d+)")
+if not REDIS_HOST then REDIS_HOST = redis_url:match("redis://([^:/]+)") or "127.0.0.1" end
+local REDIS_PORT = tonumber(port_str) or 6379
+
+local POD_NS = os.getenv("POD_NAMESPACE") or ""
+local ip = ngx.var.remote_addr
+local is_api = ngx.var.uri:sub(1, 5) == "/api/"
+
+-- Emit the JSON 429 response and end the request.
+local function return_429()
+    ngx.status = 429
+    ngx.header["Retry-After"] = "300"
+    ngx.header["Content-Type"] = "application/json"
+    ngx.say('{"detail":"Too Many Requests"}')
+    -- ngx.say has already sent the 429 status line and the body. Finishing
+    -- with HTTP_OK is the documented way to end a request that carries
+    -- custom error content; the status set above is what the client sees.
+    return ngx.exit(ngx.HTTP_OK)
+end
+
+-- 1. Bot UA check (nginx map variable — no I/O).
+if ngx.var.bot_ua_blocked == "1" then return return_429() end
+
+-- 2. ASN check via lua-resty-maxminddb (shared DB handle opened in init_by_lua_block).
+local BLOCKED_ASNS = {
+    [16509] = true, -- Amazon AWS
+    [14618] = true, -- Amazon AWS us-east
+    [8075]  = true, -- Microsoft Azure
+    [396982]= true, -- Google Cloud
+    [20473] = true, -- Vultr
+    [24940] = true, -- Hetzner
+    [16276] = true, -- OVH
+    [36352] = true, -- ColoCrossing
+    [63949] = true, -- Linode / Akamai
+}
+local ok_mmdb, mmdb = pcall(require, "resty.maxminddb")
+if ok_mmdb and mmdb.initted() then
+    local result = mmdb.lookup(ip)
+    if result and BLOCKED_ASNS[result.autonomous_system_number] then
+        return return_429()
+    end
+end
+
+-- 3. IP prefix check (in-process shared dict — no Redis I/O per request).
+-- IPv4 gets 4 candidates: three trailing-dot prefixes + the exact IP.
+-- Mirrors Django's _is_ip_prefix_blocked normalisation and _refresh_ip_prefix_blocklist.
+-- Any other address form gets the exact lookup only: indexing octets that
+-- do not exist would raise and turn every such request into a 500.
+local dict = ngx.shared.ip_prefix_blocked
+local parts = {}
+for p in ip:gmatch("[^.]+") do parts[#parts + 1] = p end
+local blocked = dict:get(ip)
+if not blocked and #parts == 4 then
+    local p1 = parts[1] .. "."
+    local p2 = p1 .. parts[2] .. "."
+    local p3 = p2 .. parts[3] .. "."
+    blocked = dict:get(p1) or dict:get(p2) or dict:get(p3)
+end
+if blocked then return return_429() end
+
+-- 4+5. Pipeline SELECT + both STRING block checks in one Redis round trip.
+local red = require("resty.redis"):new()
+red:set_timeout(200)
+local ok = red:connect(REDIS_HOST, REDIS_PORT)
+if not ok then return end -- fail-open
+
+red:init_pipeline()
+red:select(1)
+red:get("rl:ip:" .. ip .. ":blocked")
+red:get("rl:api:ns:" .. POD_NS .. ":ip:" .. ip .. ":blocked")
+local res = red:commit_pipeline()
+if not res then
+    -- Pipeline failed: the socket state is unknown, so close it instead of
+    -- handing it back to the keepalive pool, then fail-open.
+    red:close()
+    return
+end
+red:set_keepalive(10000, 1)
+
+-- res[1] is the SELECT reply; anything but "OK" means the GETs did not run
+-- against DB 1, so their answers are ignored (fail-open).
+if res[1] ~= "OK" then return end
+
+if res[2] == "1" then return return_429() end
+if is_api and res[3] == "1" then return return_429() end
diff --git a/docker/config/nginx/nginx.conf b/docker/config/nginx/nginx.conf
index 62d822f5c..c0f8f70f4 100644
--- a/docker/config/nginx/nginx.conf
+++ b/docker/config/nginx/nginx.conf
@@ -1,10 +1,15 @@
-load_module modules/ngx_http_geoip2_module.so;
+# OpenResty configuration — replaces the previous nginx + libnginx-mod-http-geoip2 stack.
+# ASN-based blocking moved to blocklist.lua using lua-resty-maxminddb (pure Lua, no C module).
 
-user www-data nginx;
+# Make POD_NAMESPACE and Redis URL available to Lua.
+env POD_NAMESPACE;
+env REDIS_URL;
+
+user www-data;
 worker_processes 1;
 
-error_log /var/log/nginx/error.log warn;
-pid /var/run/nginx.pid;
+error_log /var/log/openresty/error.log warn;
+pid /var/run/openresty.pid;
 
 
 events {
@@ -13,7 +18,7 @@ events {
 
 
 http {
-    include /etc/nginx/mime.types;
+    include /usr/local/openresty/nginx/conf/mime.types;
     default_type application/octet-stream;
 
     # ----------------------------------------------------------------
@@ -31,7 +36,7 @@ http {
                     '"$http_user_agent" "$http_x_forwarded_for" '
                     'rt=$request_time';
 
-    access_log /var/log/nginx/access.log main;
+    access_log /var/log/openresty/access.log main;
 
     # ----------------------------------------------------------------
     # FIX: kernel bypass — was off (disables zero-copy file serving)
@@ -65,30 +70,7 @@ http {
     limit_req_zone $binary_remote_addr zone=sapl_heavy:10m rate=10r/m;
 
     # ----------------------------------------------------------------
-    # ASN-Based Blocking (datacenter / scraper ASNs).
-    # Requires libnginx-mod-http-geoip2 and GeoLite2-ASN.mmdb.
-    # See rate-limiter-v2.md Phase 0 §3.4 for install instructions.
-    # ----------------------------------------------------------------
-    geoip2 /etc/nginx/geoip/GeoLite2-ASN.mmdb {
-        $geoip2_asn_number autonomous_system_number;
-        $geoip2_asn_org autonomous_system_organization;
-    }
-
-    map $geoip2_asn_number $bot_asn {
-        default 0;
-        16509 1; # Amazon AWS
-        14618 1; # Amazon AWS us-east
-        8075 1; # Microsoft Azure
-        396982 1; # Google Cloud
-        20473 1; # Vultr
-        24940 1; # Hetzner
-        16276 1; # OVH
-        36352 1; # ColoCrossing
-        63949 1; # Linode / Akamai
-    }
-
-    # ----------------------------------------------------------------
-    # Bot blocking by User-Agent.
+    # Bot blocking by User-Agent (nginx map — no module required).
     # Chrome/98.0.4758 is a confirmed scraper (no real user runs a
     # 2022 browser version in 2026). Googlebot excluded for SEO.
     # ----------------------------------------------------------------
@@ -108,6 +90,95 @@ http {
         "~*Chrome/98\.0\.4758" 1;
     }
 
+    # ----------------------------------------------------------------
+    # OpenResty: open MaxMind ASN DB once in master (workers inherit fd).
+    # ASN-based blocking runs in blocklist.lua via lua-resty-maxminddb.
+    # A load/open failure is logged; the ASN check then stays disabled.
+    # ----------------------------------------------------------------
+    init_by_lua_block {
+        local db_path = "/usr/local/openresty/nginx/conf/geoip/GeoLite2-ASN.mmdb"
+        local ok, mmdb = pcall(require, "resty.maxminddb")
+        if not ok then
+            ngx.log(ngx.ERR, "[blocklist] resty.maxminddb not loadable, ASN check disabled: ", mmdb)
+        else
+            -- On a raised error pcall returns the message in the second slot.
+            local ok_init, res, err = pcall(mmdb.init, db_path)
+            if not ok_init then err = res end
+            if not ok_init or not mmdb.initted() then
+                ngx.log(ngx.ERR, "[blocklist] cannot open ", db_path, ", ASN check disabled: ", tostring(err))
+            end
+        end
+    }
+
+    # ----------------------------------------------------------------
+    # OpenResty: shared dict for IP-prefix deny list (refreshed every 60s).
+    # 1 MB holds ~10,000 prefix entries with overhead to spare.
+    # ----------------------------------------------------------------
+    lua_shared_dict ip_prefix_blocked 1m;
+
+    # ----------------------------------------------------------------
+    # OpenResty: background timer populates ip_prefix_blocked from Redis.
+    # Runs in worker 0 only (the dict is shared), at startup and every 60s.
+    # ----------------------------------------------------------------
+    init_worker_by_lua_block {
+        if ngx.worker.id() ~= 0 then return end
+
+        -- Parse REDIS_URL (redis://host:port or redis://host:port/db).
+        local url = os.getenv("REDIS_URL") or "redis://127.0.0.1:6379"
+        local REDIS_HOST, port_str = url:match("redis://([^:/]+):(%d+)")
+        if not REDIS_HOST then REDIS_HOST = url:match("redis://([^:/]+)") or "127.0.0.1" end
+        local REDIS_PORT = tonumber(port_str) or 6379
+        local INTERVAL = 60
+
+        -- Fetch the rl:ip_prefix:blocked SET from Redis DB 1.
+        -- Returns the member list, or nil plus an error message.
+        local function fetch_members()
+            local red = require("resty.redis"):new()
+            red:set_timeout(500)
+            local ok, err = red:connect(REDIS_HOST, REDIS_PORT)
+            if not ok then return nil, err end
+            ok, err = red:select(1)
+            if not ok then red:close(); return nil, err end
+            local members
+            members, err = red:smembers("rl:ip_prefix:blocked")
+            if type(members) ~= "table" then red:close(); return nil, err end
+            red:set_keepalive(10000, 1)
+            return members
+        end
+
+        local function refresh(premature)
+            if premature then return end
+            local members, err = fetch_members()
+            if not members then
+                -- Keep the previous list rather than dropping every block.
+                ngx.log(ngx.WARN, "[blocklist] ip_prefix refresh skipped: ", tostring(err))
+                return
+            end
+            local dict = ngx.shared.ip_prefix_blocked
+            local fresh = {}
+            for _, m in ipairs(members) do
+                -- Normalise: strip trailing dots, then re-add one unless the entry is
+                -- a full dotted-quad (3 dots) or not IPv4 at all (contains ':').
+                local stripped = m:gsub("%.+$", "")
+                local _, dots = stripped:gsub("%.", "")
+                local key = stripped
+                if dots < 3 and not stripped:find(":", 1, true) then key = stripped .. "." end
+                fresh[key] = true
+                dict:set(key, 1)
+            end
+            -- Drop stale keys only after the new ones are stored, so the dict
+            -- is never empty mid-refresh (flushing first opened that gap).
+            for _, key in ipairs(dict:get_keys(0)) do
+                if not fresh[key] then dict:delete(key) end
+            end
+        end
+
+        ngx.timer.at(0, refresh)
+        -- timer.every keeps firing even if one run raises, whereas a
+        -- self-rescheduling timer.at chain stops on the first error.
+        ngx.timer.every(INTERVAL, refresh)
+    }
+
     gzip on;
     gzip_disable "MSIE [1-6]\\.(?!.*SV1)";
     gzip_proxied any;
@@ -115,5 +186,5 @@ http {
     gzip_types text/plain text/css text/javascript application/javascript application/x-javascript text/xml application/xml application/rss+xml image/gif image/png image/x-icon image/jpeg image/svg+xml;
     gzip_vary on;
 
-    include /etc/nginx/conf.d/*.conf;
+    include /usr/local/openresty/nginx/conf/conf.d/*.conf;
 }
diff --git a/docker/config/nginx/sapl.conf b/docker/config/nginx/sapl.conf
index ea9e955da..bae13ad00 100644
--- a/docker/config/nginx/sapl.conf
+++ b/docker/config/nginx/sapl.conf
@@ -17,18 +17,11 @@ server {
     client_max_body_size 4G;
 
     # ----------------------------------------------------------------
-    # Block known scraper ASNs (datacenter traffic) — zero Python cost.
+    # OpenResty Lua blocklist: ASN block, prefix block, and Redis-backed
+    # IP/API blocks — all evaluated before reaching Gunicorn.
+    # UA block is also enforced here via $bot_ua_blocked map variable.
     # ----------------------------------------------------------------
-    if ($bot_asn = 1) {
-        return 429 "Too Many Requests";
-    }
-
-    # ----------------------------------------------------------------
-    # Block known bots by User-Agent — zero Python cost.
-    # ----------------------------------------------------------------
-    if ($bot_ua_blocked = 1) {
-        return 429 "Too Many Requests";
-    }
+    access_by_lua_file /usr/local/openresty/nginx/conf/blocklist.lua;
 
     # ----------------------------------------------------------------
     # robots.txt served directly by nginx.
diff --git a/docker/startup_scripts/start.sh b/docker/startup_scripts/start.sh
index b2fc83f11..96143cd70 100755
--- a/docker/startup_scripts/start.sh
+++ b/docker/startup_scripts/start.sh
@@ -345,11 +345,12 @@ start_services() {
   log "Starting gunicorn..."
   gunicorn -c gunicorn.conf.py &
   log "Applying nginx config (burst: general=${NGINX_BURST_GENERAL} media=${NGINX_BURST_MEDIA} api=${NGINX_BURST_API} heavy=${NGINX_BURST_HEAVY})..."
+  OR_CONF=/usr/local/openresty/nginx/conf
   envsubst '${NGINX_BURST_GENERAL} ${NGINX_BURST_MEDIA} ${NGINX_BURST_API} ${NGINX_BURST_HEAVY}' \
-    < /etc/nginx/conf.d/sapl.conf.template \
-    > /etc/nginx/conf.d/sapl.conf
-  log "Starting nginx..."
-  exec /usr/sbin/nginx -g "daemon off;"
+    < "$OR_CONF/conf.d/sapl.conf.template" \
+    > "$OR_CONF/conf.d/sapl.conf"
+  log "Starting OpenResty..."
+  exec /usr/local/openresty/nginx/sbin/nginx -g "daemon off;"
 }
 
 main() {
diff --git a/sapl/middleware/ratelimit.py b/sapl/middleware/ratelimit.py
index 038ef33ff..5be632cef 100644
--- a/sapl/middleware/ratelimit.py
+++ b/sapl/middleware/ratelimit.py
@@ -359,8 +359,8 @@ class RateLimitMiddleware:
     _ua_blocklist_fetched_at: float = 0.0
 
     # In-process cache for the Redis IP-prefix deny list (operator-curated SET
-    # of dotted-decimal prefixes, e.g. '103.124.225'). Same refresh pattern as
-    # the UA deny list above, on its own cadence (RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH).
+    # of dotted-decimal prefixes). Normalized to trailing-dot form on refresh so
+    # checking is O(1) per candidate via set membership.
     _ip_prefix_blocklist: set = set()
     _ip_prefix_blocklist_fetched_at: float = 0.0
 
@@ -724,49 +724,58 @@ class RateLimitMiddleware:
 
     def _refresh_ip_prefix_blocklist(self):
         """
-        Fetch the full IP-prefix deny list from Redis DB 1 (SMEMBERS).
-        Stores dotted-decimal prefix strings (e.g. '103.124.225') in the
-        class-level set. Falls back silently — an empty set means no prefix blocks.
+        Fetch the full IP-prefix deny list from Redis DB 1 (SMEMBERS) and
+        normalise entries to trailing-dot form so membership checks are O(1).
+
+        Normalisation: strip trailing dot, then re-add if fewer than 3 dots
+        (i.e. it's a prefix, not a full dotted-quad). Examples:
+          '103.124.225'  → '103.124.225.'
+          '103.124.225.' → '103.124.225.'
+          '103.124.225.7'→ '103.124.225.7' (exact IP, no trailing dot)
+          '2001:db8::1'  → '2001:db8::1'   (not IPv4, kept verbatim)
         """
         try:
             from django_redis import get_redis_connection
             client = get_redis_connection('ratelimit')
             raw = client.smembers(RL_IP_PREFIX_BLOCKLIST)
-            RateLimitMiddleware._ip_prefix_blocklist = {
-                m.decode() if isinstance(m, bytes) else m for m in raw
-            }
+            normalized = set()
+            for m in raw:
+                entry = m.decode() if isinstance(m, bytes) else m
+                stripped = entry.rstrip('.')
+                # Only dotted IPv4 prefixes take the trailing dot; an address
+                # containing ':' can only ever match exactly.
+                is_v4_prefix = ':' not in stripped and stripped.count('.') < 3
+                normalized.add(stripped + '.' if is_v4_prefix else stripped)
+            RateLimitMiddleware._ip_prefix_blocklist = normalized
             RateLimitMiddleware._ip_prefix_blocklist_fetched_at = time.time()
-            logger.debug('[RATELIMIT] ip_prefix_blocklist refreshed entries=%d', len(raw))
+            logger.debug('[RATELIMIT] ip_prefix_blocklist refreshed entries=%d', len(normalized))
         except Exception as exc:
             logger.debug('[RATELIMIT] ip_prefix_blocklist refresh skipped: %s', exc)
 
     def _is_ip_prefix_blocked(self, ip):
         """
-        Return True if `ip` starts with any prefix in the Redis IP-prefix deny list.
-
-        Matches are anchored on the dot boundary so that a stored prefix like
-        '103.124.225' matches '103.124.225.7' but not '103.124.2250.1' or
-        '103.124.2255' — i.e. the prefix behaves like an octet-aligned /24-ish
-        network range, not a raw string prefix.
-
-        A stored entry that is already a full dotted-quad address (3 dots and
-        no trailing dot, e.g. '103.124.225.7' for blocking one specific IP) is
-        matched by equality only — building a prefix anchor by appending a
-        trailing dot would be pointless there, since no valid IPv4 address has
-        a 5th octet. Entries with a trailing dot (e.g. '103.124.225.') are
-        still treated as prefixes.
-        Degrades to False when Redis is unavailable.
+        Return True if `ip` or any of its dot-anchored prefixes is in the
+        local IP-prefix deny set.
+
+        Generates up to 4 candidates for '203.0.113.42':
+          '203.', '203.0.', '203.0.113.', '203.0.113.42'
+        Each lookup is O(1) against the normalised in-process set.
+        An address that is not a dotted quad (e.g. IPv6) is checked by
+        exact membership only.
+        Degrades to False when Redis is unavailable (empty set).
         """
         if time.time() - self._ip_prefix_blocklist_fetched_at > settings.RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH:
             self._refresh_ip_prefix_blocklist()
         if not self._ip_prefix_blocklist:
             return False
-        for prefix in self._ip_prefix_blocklist:
-            if ip == prefix:
-                return True
-            if prefix.count('.') >= 3 and not prefix.endswith('.'):
-                continue  # full address (no trailing dot) — only exact match makes sense
-            anchored = prefix if prefix.endswith('.') else prefix + '.'
-            if ip.startswith(anchored):
-                return True
-        return False
+        parts = ip.split('.')
+        if len(parts) != 4:
+            # No octet prefixes to derive; the removed loop still honoured an exact entry.
+            return ip in self._ip_prefix_blocklist
+        candidates = (
+            parts[0] + '.',
+            parts[0] + '.' + parts[1] + '.',
+            parts[0] + '.' + parts[1] + '.' + parts[2] + '.',
+            ip,
+        )
+        return any(c in self._ip_prefix_blocklist for c in candidates)
diff --git a/sapl/settings.py b/sapl/settings.py
index 512eee0d0..8fcd1be2c 100644
--- a/sapl/settings.py
+++ b/sapl/settings.py
@@ -243,7 +243,7 @@ def _build_cache_layer(pod_namespace, cache_backend, redis_url):
     """
     if cache_backend == 'redis' and bool(redis_url):
         _pool = {
-            'max_connections': 6, # 1,200 pods × 2 workers × 6 = 14,400 peak
+            'max_connections': 3, # 1,200 pods × 2 workers × 3 = 7,200 peak (headroom for nginx connections)
             'socket_timeout': 0.5,
             'socket_connect_timeout': 0.5,
         }
@@ -413,9 +413,7 @@ RATE_LIMITER_RATE_BOT = config('RATE_LIMITER_RATE_BOT', default='5/m')
 # Lower values pick up new blocked UAs faster; higher values reduce Redis round-trips.
 RATE_LIMITER_UA_BLOCKLIST_REFRESH = config('RATE_LIMITER_UA_BLOCKLIST_REFRESH', default=60, cast=int)
 
-# Seconds between re-fetches of the runtime IP-prefix deny list from Redis DB 1
-# (rl:ip_prefix:blocked — operator-curated SET of dotted-decimal prefixes,
-# e.g. '103.124.225', managed directly via SADD/SREM).
+# Seconds between re-fetches of the IP-prefix deny list (rl:ip_prefix:blocked SET).
 RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH = config('RATE_LIMITER_IP_PREFIX_BLOCKLIST_REFRESH', default=60, cast=int)
 
 # Number of shards for the blocked-IP ZSET indexes.