fix: HTML parser two-pass rewrite + fire-and-forget prefetch
Three fixes:
1. HTML parser completely rewritten with two-pass approach:
- Pass 1: regex finds each tag-type element and its full inner
content (up to closing </li|span|td|div>)
- Pass 2: within the content, extracts the tag name from the
tags=NAME URL parameter in the search link
The old single-pass regex captured the ? wiki-link (first <a>)
instead of the tag name (second <a>). The URL-param extraction
works on Rule34 (40 tags), Safebooru.org (47 tags), and
yande.re (3 tags). Gelbooru proper returns 0 (post page only
has ? links with no tags= param) which is correct — Gelbooru
uses the batch tag API instead.
2. prefetch_batch is now truly fire-and-forget:
gelbooru.py and moebooru.py use asyncio.create_task instead of
await for prefetch_batch. search() returns immediately. The
probe + batch/HTML fetch runs in the background. Previously
search() blocked on the probe, which made Rule34 searches take
5+ seconds (slow/broken Rule34 API response time).
3. The partial-cache compose fix from the previous commit
complements this: posts with 49/50 cached tags now show all
available categories instead of nothing.
This commit is contained in:
parent
165733c6e0
commit
f0fe52c886
@ -38,15 +38,30 @@ log = logging.getLogger("booru")
|
||||
# HTML parser for the universal `class="tag-type-X"` convention
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches both `class="tag-type-artist"` and combined-class forms like
|
||||
# `class="tag-link tag-type-artist"` (Konachan). Captures the type
|
||||
# label and the tag name from the first <a> inside the element.
|
||||
_TAG_TYPE_RE = re.compile(
|
||||
# Two-pass approach:
|
||||
# 1. Find each tag-type element and its full inner content.
|
||||
# 2. Within the content, extract the tag name from the `tags=NAME`
|
||||
# URL parameter in the search link.
|
||||
#
|
||||
# This handles the cross-site variation cleanly:
|
||||
# - Gelbooru proper: only has `?` wiki links (no `tags=` param) →
|
||||
# returns 0 results, which is fine because Gelbooru uses the
|
||||
# batch tag API instead of HTML scraping.
|
||||
# - Rule34 / Safebooru.org: two <a> links per tag — `?` wiki link
|
||||
# + `<a href="...tags=TAGNAME">display name</a>`. We extract from
|
||||
# the URL, not the display text.
|
||||
# - yande.re / Konachan (Moebooru): same two-link pattern, but the
|
||||
# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`.
|
||||
#
|
||||
# The `tags=` extraction gives us the canonical underscore form
|
||||
# directly from the URL, no display-text normalization needed.
|
||||
_TAG_ELEMENT_RE = re.compile(
|
||||
r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME
|
||||
r'(?:[^<]*<[^>]*>)*?' # consume nested tags lazily
|
||||
r'<a[^>]*>([^<]+)</a>', # tag name in the link
|
||||
r'(.*?)' # inner content (lazy)
|
||||
r'</(?:li|span|td|div)>', # closing tag
|
||||
re.DOTALL,
|
||||
)
|
||||
_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)')
|
||||
|
||||
# HTML class name -> Capitalized label (matches danbooru.py / e621.py)
|
||||
_LABEL_MAP: dict[str, str] = {
|
||||
@ -478,17 +493,31 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
|
||||
``post.tag_categories``.
|
||||
- ``labels_dict`` is ``{tag_name: label}`` ready for
|
||||
``db.set_tag_labels``.
|
||||
|
||||
Uses a two-pass approach: find each ``tag-type-X`` element, then
|
||||
extract the tag name from the ``tags=NAME`` URL parameter inside
|
||||
the element's links. This avoids the `?` wiki-link ambiguity
|
||||
(Gelbooru-forks have a ``?`` link before the actual tag link).
|
||||
Returns empty on Gelbooru proper (whose post page only has ``?``
|
||||
links with no ``tags=`` parameter); that's fine because Gelbooru
|
||||
uses the batch tag API instead.
|
||||
"""
|
||||
from urllib.parse import unquote
|
||||
|
||||
cats: dict[str, list[str]] = {}
|
||||
labels: dict[str, str] = {}
|
||||
for m in _TAG_TYPE_RE.finditer(html):
|
||||
for m in _TAG_ELEMENT_RE.finditer(html):
|
||||
type_class = m.group(1).lower()
|
||||
raw_name = m.group(2).strip()
|
||||
if not raw_name or raw_name == "?":
|
||||
continue
|
||||
tag_name = raw_name.replace(" ", "_").lower()
|
||||
content = m.group(2)
|
||||
label = _LABEL_MAP.get(type_class)
|
||||
if label:
|
||||
if not label:
|
||||
continue
|
||||
tag_match = _TAG_NAME_RE.search(content)
|
||||
if not tag_match:
|
||||
continue
|
||||
tag_name = unquote(tag_match.group(1)).strip().lower()
|
||||
if not tag_name:
|
||||
continue
|
||||
cats.setdefault(label, []).append(tag_name)
|
||||
labels[tag_name] = label
|
||||
return cats, labels
|
||||
|
||||
@ -82,7 +82,8 @@ class GelbooruClient(BooruClient):
|
||||
)
|
||||
)
|
||||
if self.category_fetcher is not None:
|
||||
await self.category_fetcher.prefetch_batch(posts)
|
||||
import asyncio
|
||||
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
|
||||
return posts
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -57,7 +57,8 @@ class MoebooruClient(BooruClient):
|
||||
)
|
||||
)
|
||||
if self.category_fetcher is not None:
|
||||
await self.category_fetcher.prefetch_batch(posts)
|
||||
import asyncio
|
||||
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
|
||||
return posts
|
||||
|
||||
async def get_post(self, post_id: int) -> Post | None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user