From 5a511338c812fdf69cd883fcc6b30b9b991181f8 Mon Sep 17 00:00:00 2001
From: pax <paxxe@protonmail.com>
Date: Sat, 11 Apr 2026 16:26:00 -0500
Subject: [PATCH] =?UTF-8?q?security:=20fix=20#14=20=E2=80=94=20cap=20categ?=
 =?UTF-8?q?ory=5Ffetcher=20HTML=20body=20before=20regex=20walk?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CategoryFetcher.fetch_post pulls a post-view HTML page and runs
_TAG_ELEMENT_RE.finditer over the full body. The regex itself is
linear (no catastrophic backtracking shape), but a hostile server
returning hundreds of MB of HTML still pegs CPU walking the buffer.
This caps the text the regex sees at 2 Mi characters (about 2MB) —
well above any legit Gelbooru/Moebooru post page (~30-150KB).

Truncation rather than streaming because httpx already buffers the
body before _request returns; the cost we're cutting is the regex
walk, not the memory hit. A full streaming refactor of fetch_post
is a follow-up that the audit explicitly flagged as out of scope
("not catastrophic — defense in depth").

Audit-Ref: SECURITY_AUDIT.md finding #14
Severity: Informational
---
 booru_viewer/core/api/category_fetcher.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/booru_viewer/core/api/category_fetcher.py b/booru_viewer/core/api/category_fetcher.py
index 7e3a81b..832b5e0 100644
--- a/booru_viewer/core/api/category_fetcher.py
+++ b/booru_viewer/core/api/category_fetcher.py
@@ -76,6 +76,13 @@ _LABEL_MAP: dict[str, str] = {
     "style":     "Style",
 }
 
+# Hard cap on the HTML text the regex walks over. A real
+# Gelbooru/Moebooru post page is ~30-150KB; capping at 2 Mi
+# characters (the slice is on decoded text, not raw bytes) gives
+# any legit page headroom while keeping a hostile server from
+# feeding the regex hundreds of MB and pegging CPU. Audit finding #14.
+_FETCH_POST_HTML_CAP = 2 * 1024 * 1024
+
 # Gelbooru tag DAPI integer code -> Capitalized label (for fetch_via_tag_api)
 _GELBOORU_TYPE_MAP: dict[int, str] = {
     0: "General",
@@ -290,7 +297,12 @@ class CategoryFetcher:
                 log.warning("Category HTML fetch for #%d failed: %s: %s",
                             post.id, type(e).__name__, e)
                 return False
-        cats, labels = _parse_post_html(resp.text)
+        # Cap the HTML the regex walks over (audit #14). Truncation
+        # vs. full read: the body is already buffered by httpx, so
+        # this doesn't prevent a memory hit — but it does cap the
+        # CPU spent in _TAG_ELEMENT_RE.finditer for a hostile server
+        # returning hundreds of MB of HTML.
+        cats, labels = _parse_post_html(resp.text[:_FETCH_POST_HTML_CAP])
         if not cats:
             return False
         post.tag_categories = _canonical_order(cats)