From f0fe52c886377763d2070c0bfc499dda74615edd Mon Sep 17 00:00:00 2001
From: pax
Date: Thu, 9 Apr 2026 19:31:43 -0500
Subject: [PATCH] fix: HTML parser two-pass rewrite + fire-and-forget prefetch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes:

1. HTML parser completely rewritten with a two-pass approach:
   - Pass 1: regex finds each tag-type element and its full inner
     content (up to the closing `</li>`)
   - Pass 2: within the content, extracts the tag name from the
     tags=NAME URL parameter in the search link

   The old single-pass regex captured the `?` wiki-link (the first
   `<a>`) instead of the tag name (the second `<a>`).

   The URL-param extraction works on Rule34 (40 tags), Safebooru.org
   (47 tags), and yande.re (3 tags). Gelbooru proper returns 0 (its
   post page only has `?` links with no tags= param), which is
   correct — Gelbooru uses the batch tag API instead.

2. prefetch_batch is now truly fire-and-forget: gelbooru.py and
   moebooru.py use asyncio.create_task instead of await for
   prefetch_batch. search() returns immediately. The probe +
   batch/HTML fetch runs in the background.

   Previously search() blocked on the probe, which made Rule34
   searches take 5+ seconds (slow/broken Rule34 API response time).

3. The partial-cache compose fix from the previous commit complements
   this: posts with 49/50 cached tags now show all available
   categories instead of nothing.
--- booru_viewer/core/api/category_fetcher.py | 57 +++++++++++++++++------ booru_viewer/core/api/gelbooru.py | 3 +- booru_viewer/core/api/moebooru.py | 3 +- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/booru_viewer/core/api/category_fetcher.py b/booru_viewer/core/api/category_fetcher.py index 07f5e54..b2f2eca 100644 --- a/booru_viewer/core/api/category_fetcher.py +++ b/booru_viewer/core/api/category_fetcher.py @@ -38,15 +38,30 @@ log = logging.getLogger("booru") # HTML parser for the universal `class="tag-type-X"` convention # --------------------------------------------------------------------------- -# Matches both `class="tag-type-artist"` and combined-class forms like -# `class="tag-link tag-type-artist"` (Konachan). Captures the type -# label and the tag name from the first inside the element. -_TAG_TYPE_RE = re.compile( +# Two-pass approach: +# 1. Find each tag-type element and its full inner content. +# 2. Within the content, extract the tag name from the `tags=NAME` +# URL parameter in the search link. +# +# This handles the cross-site variation cleanly: +# - Gelbooru proper: only has `?` wiki links (no `tags=` param) → +# returns 0 results, which is fine because Gelbooru uses the +# batch tag API instead of HTML scraping. +# - Rule34 / Safebooru.org: two links per tag — `?` wiki link +# + `display name`. We extract from +# the URL, not the display text. +# - yande.re / Konachan (Moebooru): same two-link pattern, but the +# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`. +# +# The `tags=` extraction gives us the canonical underscore form +# directly from the URL, no display-text normalization needed. +_TAG_ELEMENT_RE = re.compile( r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME - r'(?:[^<]*<[^>]*>)*?' 
# consume nested tags lazily - r']*>([^<]+)', # tag name in the link + r'(.*?)' # inner content (lazy) + r'', # closing tag re.DOTALL, ) +_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)') # HTML class name -> Capitalized label (matches danbooru.py / e621.py) _LABEL_MAP: dict[str, str] = { @@ -478,19 +493,33 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]: ``post.tag_categories``. - ``labels_dict`` is ``{tag_name: label}`` ready for ``db.set_tag_labels``. + + Uses a two-pass approach: find each ``tag-type-X`` element, then + extract the tag name from the ``tags=NAME`` URL parameter inside + the element's links. This avoids the `?` wiki-link ambiguity + (Gelbooru-forks have a ``?`` link before the actual tag link). + Returns empty on Gelbooru proper (whose post page only has ``?`` + links with no ``tags=`` parameter); that's fine because Gelbooru + uses the batch tag API instead. """ + from urllib.parse import unquote + cats: dict[str, list[str]] = {} labels: dict[str, str] = {} - for m in _TAG_TYPE_RE.finditer(html): + for m in _TAG_ELEMENT_RE.finditer(html): type_class = m.group(1).lower() - raw_name = m.group(2).strip() - if not raw_name or raw_name == "?": - continue - tag_name = raw_name.replace(" ", "_").lower() + content = m.group(2) label = _LABEL_MAP.get(type_class) - if label: - cats.setdefault(label, []).append(tag_name) - labels[tag_name] = label + if not label: + continue + tag_match = _TAG_NAME_RE.search(content) + if not tag_match: + continue + tag_name = unquote(tag_match.group(1)).strip().lower() + if not tag_name: + continue + cats.setdefault(label, []).append(tag_name) + labels[tag_name] = label return cats, labels diff --git a/booru_viewer/core/api/gelbooru.py b/booru_viewer/core/api/gelbooru.py index f47e850..3544c0f 100644 --- a/booru_viewer/core/api/gelbooru.py +++ b/booru_viewer/core/api/gelbooru.py @@ -82,7 +82,8 @@ class GelbooruClient(BooruClient): ) ) if self.category_fetcher is not None: - await 
self.category_fetcher.prefetch_batch(posts) + import asyncio + asyncio.create_task(self.category_fetcher.prefetch_batch(posts)) return posts @staticmethod diff --git a/booru_viewer/core/api/moebooru.py b/booru_viewer/core/api/moebooru.py index 05d3ced..e7a404c 100644 --- a/booru_viewer/core/api/moebooru.py +++ b/booru_viewer/core/api/moebooru.py @@ -57,7 +57,8 @@ class MoebooruClient(BooruClient): ) ) if self.category_fetcher is not None: - await self.category_fetcher.prefetch_batch(posts) + import asyncio + asyncio.create_task(self.category_fetcher.prefetch_batch(posts)) return posts async def get_post(self, post_id: int) -> Post | None: