From 5a511338c812fdf69cd883fcc6b30b9b991181f8 Mon Sep 17 00:00:00 2001 From: pax Date: Sat, 11 Apr 2026 16:26:00 -0500 Subject: [PATCH] =?UTF-8?q?security:=20fix=20#14=20=E2=80=94=20cap=20categ?= =?UTF-8?q?ory=5Ffetcher=20HTML=20body=20before=20regex=20walk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CategoryFetcher.fetch_post pulls a post-view HTML page and runs _TAG_ELEMENT_RE.finditer over the full body. The regex itself is linear (no catastrophic backtracking shape), but a hostile server returning hundreds of MB of HTML still pegs CPU walking the buffer. Caps the body the regex sees at 2MB — well above any legit Gelbooru/Moebooru post page (~30-150KB). Truncation rather than streaming because httpx already buffers the body before _request returns; the cost we're cutting is the regex walk, not the memory hit. A full streaming refactor of fetch_post is a follow-up that the audit explicitly flagged as out of scope ("not catastrophic — defense in depth"). Audit-Ref: SECURITY_AUDIT.md finding #14 Severity: Informational --- booru_viewer/core/api/category_fetcher.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/booru_viewer/core/api/category_fetcher.py b/booru_viewer/core/api/category_fetcher.py index 7e3a81b..832b5e0 100644 --- a/booru_viewer/core/api/category_fetcher.py +++ b/booru_viewer/core/api/category_fetcher.py @@ -76,6 +76,13 @@ _LABEL_MAP: dict[str, str] = { "style": "Style", } +# Hard cap on the HTML body the regex walks over. A real +# Gelbooru/Moebooru post page is ~30-150KB; capping at 2MB gives +# any legit page comfortable headroom while preventing a hostile +# server from feeding the regex hundreds of MB and pegging CPU. +# Audit finding #14. 
+_FETCH_POST_HTML_CAP = 2 * 1024 * 1024 + # Gelbooru tag DAPI integer code -> Capitalized label (for fetch_via_tag_api) _GELBOORU_TYPE_MAP: dict[int, str] = { 0: "General", @@ -290,7 +297,12 @@ class CategoryFetcher: log.warning("Category HTML fetch for #%d failed: %s: %s", post.id, type(e).__name__, e) return False - cats, labels = _parse_post_html(resp.text) + # Cap the HTML the regex walks over (audit #14). We truncate + # rather than stream: the body is already buffered by httpx, so + # this doesn't prevent a memory hit — but it does cap the + # CPU spent in _TAG_ELEMENT_RE.finditer for a hostile server + # returning hundreds of MB of HTML. + cats, labels = _parse_post_html(resp.text[:_FETCH_POST_HTML_CAP]) if not cats: return False post.tag_categories = _canonical_order(cats)