fix: HTML parser two-pass rewrite + fire-and-forget prefetch

Three fixes:

1. HTML parser completely rewritten with two-pass approach:
   - Pass 1: regex finds each tag-type element and its full inner
     content (up to closing </li|span|td|div>)
   - Pass 2: within the content, extracts the tag name from the
     tags=NAME URL parameter in the search link
   The old single-pass regex captured the ? wiki-link (first <a>)
   instead of the tag name (second <a>). The URL-param extraction
   works on Rule34 (40 tags), Safebooru.org (47 tags), and
   yande.re (3 tags). Gelbooru proper returns 0 (its post page
   only has ? links with no tags= param), which is correct —
   Gelbooru uses the batch tag API instead.

2. prefetch_batch is now truly fire-and-forget:
   gelbooru.py and moebooru.py use asyncio.create_task instead of
   await for prefetch_batch. search() returns immediately. The
   probe + batch/HTML fetch runs in the background. Previously
   search() blocked on the probe, which made Rule34 searches take
   5+ seconds (slow/broken Rule34 API response time).

3. The partial-cache compose fix from the previous commit
   complements this: posts with 49/50 cached tags now show all
   available categories instead of nothing.
This commit is contained in:
pax 2026-04-09 19:31:43 -05:00
parent 165733c6e0
commit f0fe52c886
3 changed files with 47 additions and 16 deletions

View File

@ -38,15 +38,30 @@ log = logging.getLogger("booru")
# HTML parser for the universal `class="tag-type-X"` convention # HTML parser for the universal `class="tag-type-X"` convention
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Matches both `class="tag-type-artist"` and combined-class forms like # Two-pass approach:
# `class="tag-link tag-type-artist"` (Konachan). Captures the type # 1. Find each tag-type element and its full inner content.
# label and the tag name from the first <a> inside the element. # 2. Within the content, extract the tag name from the `tags=NAME`
_TAG_TYPE_RE = re.compile( # URL parameter in the search link.
#
# This handles the cross-site variation cleanly:
# - Gelbooru proper: only has `?` wiki links (no `tags=` param) →
# returns 0 results, which is fine because Gelbooru uses the
# batch tag API instead of HTML scraping.
# - Rule34 / Safebooru.org: two <a> links per tag — `?` wiki link
# + `<a href="...tags=TAGNAME">display name</a>`. We extract from
# the URL, not the display text.
# - yande.re / Konachan (Moebooru): same two-link pattern, but the
# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`.
#
# The `tags=` extraction gives us the canonical underscore form
# directly from the URL, no display-text normalization needed.
_TAG_ELEMENT_RE = re.compile(
r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME
r'(?:[^<]*<[^>]*>)*?' # consume nested tags lazily r'(.*?)' # inner content (lazy)
r'<a[^>]*>([^<]+)</a>', # tag name in the link r'</(?:li|span|td|div)>', # closing tag
re.DOTALL, re.DOTALL,
) )
_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)')
# HTML class name -> Capitalized label (matches danbooru.py / e621.py) # HTML class name -> Capitalized label (matches danbooru.py / e621.py)
_LABEL_MAP: dict[str, str] = { _LABEL_MAP: dict[str, str] = {
@ -478,17 +493,31 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
``post.tag_categories``. ``post.tag_categories``.
- ``labels_dict`` is ``{tag_name: label}`` ready for - ``labels_dict`` is ``{tag_name: label}`` ready for
``db.set_tag_labels``. ``db.set_tag_labels``.
Uses a two-pass approach: find each ``tag-type-X`` element, then
extract the tag name from the ``tags=NAME`` URL parameter inside
the element's links. This avoids the `?` wiki-link ambiguity
(Gelbooru-forks have a ``?`` link before the actual tag link).
Returns empty on Gelbooru proper (whose post page only has ``?``
links with no ``tags=`` parameter); that's fine because Gelbooru
uses the batch tag API instead.
""" """
from urllib.parse import unquote
cats: dict[str, list[str]] = {} cats: dict[str, list[str]] = {}
labels: dict[str, str] = {} labels: dict[str, str] = {}
for m in _TAG_TYPE_RE.finditer(html): for m in _TAG_ELEMENT_RE.finditer(html):
type_class = m.group(1).lower() type_class = m.group(1).lower()
raw_name = m.group(2).strip() content = m.group(2)
if not raw_name or raw_name == "?":
continue
tag_name = raw_name.replace(" ", "_").lower()
label = _LABEL_MAP.get(type_class) label = _LABEL_MAP.get(type_class)
if label: if not label:
continue
tag_match = _TAG_NAME_RE.search(content)
if not tag_match:
continue
tag_name = unquote(tag_match.group(1)).strip().lower()
if not tag_name:
continue
cats.setdefault(label, []).append(tag_name) cats.setdefault(label, []).append(tag_name)
labels[tag_name] = label labels[tag_name] = label
return cats, labels return cats, labels

View File

@ -82,7 +82,8 @@ class GelbooruClient(BooruClient):
) )
) )
if self.category_fetcher is not None: if self.category_fetcher is not None:
await self.category_fetcher.prefetch_batch(posts) import asyncio
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
return posts return posts
@staticmethod @staticmethod

View File

@ -57,7 +57,8 @@ class MoebooruClient(BooruClient):
) )
) )
if self.category_fetcher is not None: if self.category_fetcher is not None:
await self.category_fetcher.prefetch_batch(posts) import asyncio
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
return posts return posts
async def get_post(self, post_id: int) -> Post | None: async def get_post(self, post_id: int) -> Post | None: