api: CategoryFetcher module with HTML scrape + batch tag API + cache

New module core/api/category_fetcher.py — the unified tag-category
fetcher for boorus that don't return categories inline.

Public surface:
  try_compose_from_cache(post) — instant, no HTTP. Builds
    post.tag_categories from cached (site_id, name) -> label
    entries. Returns True if every tag in the post is cached.
  fetch_via_tag_api(posts) — batch fast path. Collects uncached
    tags across posts, chunks into 500-name batches, GETs the
    tag DAPI. Only available when the client declares _tag_api_url
    AND has credentials (Gelbooru proper). Includes JSON/XML
    sniffing parser ported from the reverted code.
  fetch_post(post) — universal fallback. HTTP GETs the post-view
    HTML page, regex-extracts class="tag-type-X">name</a>
    markup. Works on every Gelbooru fork and every Moebooru
    deployment. Does NOT require auth.
  ensure_categories(post) — idempotent dispatch: cache compose ->
    batch API (if available) -> HTML scrape. Coalesces concurrent
    calls for the same post.id via an in-flight task dict.
  prefetch_batch(posts) — fire-and-forget background prefetch.
    ONE fetch path per invocation (no mixing batch + HTML).

Probe-and-cache for the batch tag API:
  _batch_api_works = None -> not yet probed OR transient error
                              (retry next call)
  _batch_api_works = True -> batch works (Gelbooru proper)
  _batch_api_works = False -> clean 200 + zero matching names
                               (Rule34's broken names= filter)
  Transition to True/False is permanent per instance. Transient
  errors (HTTP error, timeout, parse exception) leave None so the
  next search retries the probe.

HTML regex handles both standard tag-type-artist and combined-
class forms like tag-link tag-type-artist (Konachan). Tag names
normalized to underscore-separated lowercase.

Canonical category order: Artist > Character > Copyright >
Species > General > Meta > Lore (matches danbooru/e621 inline).

Dead code at this commit — no integration yet.
This commit is contained in:
pax 2026-04-09 19:12:43 -05:00
parent 5395569213
commit e00d88e1ec

View File

@ -0,0 +1,546 @@
"""Per-post HTML scrape + per-tag cache for boorus that don't return
tag categories inline (Gelbooru-shape, Moebooru).
Optionally accelerated by a batch-tag-API fast path when the attached
BooruClient declares a ``_tag_api_url`` AND has credentials. The fast
path fetches up to 500 tag types per request via the booru's tag DAPI,
avoiding per-post HTML scraping entirely on sites that support it.
The per-post HTML scrape path is the correctness baseline it works on
every Gelbooru fork and every Moebooru deployment regardless of auth or
API quirks. The batch API is an optimization that short-circuits it
when possible.
Architectural note: Moebooru's ``/tag.json?limit=0`` returns the entire
tag database in one request. A future "download tag database" feature
can pre-populate ``tag_types`` via that endpoint, after which
``try_compose_from_cache`` succeeds for every post without any per-post
HTTP. The cache-compose fast path already supports this no
CategoryFetcher changes needed, just a new "populate cache from dump"
entry point.
"""
from __future__ import annotations
import asyncio
import logging
import re
import xml.etree.ElementTree as ET
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .base import BooruClient, Post
from ..db import Database
log = logging.getLogger("booru")

# ---------------------------------------------------------------------------
# HTML parser for the universal `class="tag-type-X"` convention
# ---------------------------------------------------------------------------
# Matches both `class="tag-type-artist"` and combined-class forms like
# `class="tag-link tag-type-artist"` (Konachan).  Captures the type
# label and the tag name from the first <a> inside the element.
_TAG_TYPE_RE = re.compile(
    r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>'  # class containing tag-type-NAME
    r'(?:[^<]*<[^>]*>)*?'                         # consume nested tags lazily
    r'<a[^>]*>([^<]+)</a>',                       # tag name in the link
    re.DOTALL,
)

# HTML class name -> Capitalized label (matches danbooru.py / e621.py)
_LABEL_MAP: dict[str, str] = {
    "general": "General",
    "artist": "Artist",
    "character": "Character",
    "copyright": "Copyright",
    "metadata": "Meta",   # Moebooru-style class name — same bucket as "meta"
    "meta": "Meta",
    "species": "Species",
    "circle": "Circle",
    "style": "Style",
}

# Gelbooru tag DAPI integer code -> Capitalized label (for fetch_via_tag_api)
_GELBOORU_TYPE_MAP: dict[int, str] = {
    0: "General",
    1: "Artist",
    3: "Copyright",
    4: "Character",
    5: "Meta",
    # 2 = Deprecated — intentionally omitted
}

# Canonical display order for category-grouped tags.  Matches the
# insertion order danbooru.py and e621.py produce for their inline
# categorization, so the info panel renders consistently across all
# booru types.
_CATEGORY_ORDER = [
    "Artist", "Character", "Copyright", "Species",
    "General", "Meta", "Lore",
]
# ---------------------------------------------------------------------------
# CategoryFetcher
# ---------------------------------------------------------------------------
class CategoryFetcher:
    """Fetch and cache tag categories for boorus without inline data.

    Three entry points share one cache:

    * ``try_compose_from_cache`` — instant, no HTTP.
    * ``fetch_via_tag_api`` — batch fast path for Gelbooru proper.
    * ``fetch_post`` — per-post HTML scrape, universal fallback.

    ``ensure_categories`` and ``prefetch_batch`` are the public
    dispatch methods that route through these.
    """

    # Max concurrent per-post HTML scrapes (safebooru.org soft-limits at >3).
    _PREFETCH_CONCURRENCY = 3

    def __init__(
        self,
        client: "BooruClient",
        db: "Database",
        site_id: int,
    ) -> None:
        self._client = client
        self._db = db
        self._site_id = site_id
        # Bounds concurrent HTML scrapes issued by prefetch_batch / fetch_post.
        self._sem = asyncio.Semaphore(self._PREFETCH_CONCURRENCY)
        # post.id -> in-flight fetch task; coalesces concurrent
        # ensure_categories calls for the same post.
        self._inflight: dict[int, asyncio.Task] = {}
        self._batch_api_works: bool | None = None
        # Probe state for the batch tag API:
        #
        #   None  — not yet probed, OR last probe hit a transient
        #           error (HTTP error, timeout, parse exception).
        #           Next prefetch_batch will retry the probe.
        #   True  — probe succeeded (response contained >=1 of the
        #           requested names).  Batch API used for all future
        #           calls on this instance.
        #   False — probe got a clean HTTP 200 with zero matching
        #           names for ANY of the requested tags.  The API
        #           is structurally broken on this site (Rule34's
        #           ``names=`` filter returns unrelated tags).
        #           Per-post HTML used for all future calls.
        #
        # Transition to False is permanent for the instance lifetime.
        # Transition to True is permanent for the instance lifetime.
        # None -> None on transient error preserves retry ability.

    # ----- cache compose (instant, no HTTP) -----

    def try_compose_from_cache(self, post: "Post") -> bool:
        """Build ``post.tag_categories`` from cached labels.

        Returns True if **every** tag in ``post.tag_list`` has a
        cached label (i.e. the composition is complete).  When True
        the post is fully categorized and no HTTP is needed.
        """
        tags = post.tag_list
        if not tags:
            # A tagless post is trivially complete.
            return True
        cached = self._db.get_tag_labels(self._site_id, tags)
        if len(cached) < len(set(tags)):
            # At least one tag has no cached label — composition would
            # be partial, so report incomplete and let a fetch path run.
            return False
        cats: dict[str, list[str]] = {}
        for tag in tags:
            label = cached.get(tag)
            if label:
                cats.setdefault(label, []).append(tag)
        if cats:
            post.tag_categories = _canonical_order(cats)
        return True

    # ----- batch tag API fast path -----

    def _batch_api_available(self) -> bool:
        """True when the attached client declares a tag API endpoint
        AND has credentials configured."""
        return (
            self._client._tag_api_url() is not None
            and bool(self._client.api_key)
            and bool(self._client.api_user)
        )

    async def fetch_via_tag_api(self, posts: list["Post"]) -> int:
        """Batch-fetch tag types via the booru's tag DAPI.

        Collects every unique uncached tag name across ``posts``,
        chunks into 500-name batches, GETs the tag DAPI for each
        chunk, writes the results to the cache, then runs
        ``try_compose_from_cache`` on every post.

        Returns the count of newly-cached tags.
        """
        # Collect unique uncached tag names
        all_tags: set[str] = set()
        for p in posts:
            all_tags.update(p.tag_list)
        if not all_tags:
            return 0
        cached = self._db.get_tag_labels(self._site_id, list(all_tags))
        missing = [t for t in all_tags if t not in cached]
        if not missing:
            # Cache already warm — compose everything, nothing new fetched.
            for p in posts:
                self.try_compose_from_cache(p)
            return 0
        tag_api_url = self._client._tag_api_url()
        if tag_api_url is None:
            return 0
        new_labels: dict[str, str] = {}
        BATCH = 500  # per-request name cap (presumably the DAPI limit — TODO confirm)
        for i in range(0, len(missing), BATCH):
            chunk = missing[i:i + BATCH]
            params: dict = {
                "page": "dapi",
                "s": "tag",
                "q": "index",
                "json": "1",
                "names": " ".join(chunk),
                "limit": len(chunk),
            }
            if self._client.api_key and self._client.api_user:
                # Strip stray "&" / pre-encoded "api_key=..." decoration —
                # presumably guards against credentials pasted as a full
                # URL fragment (TODO confirm against settings UI).
                key = self._client.api_key.strip().lstrip("&")
                user = self._client.api_user.strip().lstrip("&")
                if key and not key.startswith("api_key="):
                    params["api_key"] = key
                if user and not user.startswith("user_id="):
                    params["user_id"] = user
            try:
                resp = await self._client._request("GET", tag_api_url, params=params)
                resp.raise_for_status()
            except Exception as e:
                # Best-effort: a failed chunk is skipped, not fatal.
                log.warning("Batch tag API failed (%d names): %s: %s",
                            len(chunk), type(e).__name__, e)
                continue
            for name, type_int in _parse_tag_response(resp):
                label = _GELBOORU_TYPE_MAP.get(type_int)
                if label:
                    new_labels[name] = label
        if new_labels:
            self._db.set_tag_labels(self._site_id, new_labels)
        # Compose from the now-warm cache
        for p in posts:
            self.try_compose_from_cache(p)
        return len(new_labels)

    # ----- per-post HTML scrape (universal fallback) -----

    async def fetch_post(self, post: "Post") -> bool:
        """Scrape the post-view HTML page for categorized tags.

        Works on every Gelbooru fork and every Moebooru deployment.
        Does NOT require auth.  Returns True on success.
        """
        url = self._client._post_view_url(post)
        if url is None:
            return False
        # Semaphore bounds concurrent scrapes; only the HTTP round-trip
        # needs to hold it.
        async with self._sem:
            try:
                resp = await self._client._request("GET", url)
                resp.raise_for_status()
            except Exception as e:
                log.warning("Category HTML fetch for #%d failed: %s: %s",
                            post.id, type(e).__name__, e)
                return False
        cats, labels = _parse_post_html(resp.text)
        if not cats:
            return False
        post.tag_categories = _canonical_order(cats)
        if labels:
            # Warm the cache so sibling posts can cache-compose later.
            self._db.set_tag_labels(self._site_id, labels)
        return True

    # ----- dispatch: ensure (single post) -----

    async def ensure_categories(self, post: "Post") -> None:
        """Idempotent.  Guarantee ``post.tag_categories`` is populated.

        Dispatch:
          1. Already populated — return.
          2. Cache compose — return if complete.
          3. Batch tag API (if available + probe passed) — return.
          4. Per-post HTML scrape — return.

        Coalesces concurrent calls for the same ``post.id``.
        """
        if post.tag_categories:
            return
        if self.try_compose_from_cache(post):
            return
        # Coalesce: if there's an in-flight fetch for this post, await it
        existing = self._inflight.get(post.id)
        if existing is not None and not existing.done():
            await existing
            return
        task = asyncio.create_task(self._do_ensure(post))
        self._inflight[post.id] = task
        try:
            await task
        finally:
            self._inflight.pop(post.id, None)

    async def _do_ensure(self, post: "Post") -> None:
        """Inner dispatch for ensure_categories."""
        # Batch API path (for single-post ensure, e.g. click or save)
        if self._batch_api_works is True and self._batch_api_available():
            await self.fetch_via_tag_api([post])
            if post.tag_categories:
                return
        # HTML fallback
        await self.fetch_post(post)

    # ----- dispatch: prefetch (batch, fire-and-forget) -----

    async def prefetch_batch(self, posts: list["Post"]) -> None:
        """Background prefetch for a page of search results.

        ONE fetch path per invocation — no mixing batch API + HTML
        scrape in the same call.

        Dispatch (exactly one branch executes per call):

          a. ``_batch_api_works is True`` —
             ``fetch_via_tag_api`` for all uncached posts.
          b. ``_batch_api_works is None`` AND capability check passes —
             ``fetch_via_tag_api`` as the probe.
               - HTTP 200 + >=1 requested name matched —
                 ``_batch_api_works = True``.  Done.
               - HTTP 200 + 0 requested names matched —
                 ``_batch_api_works = False``.  Stop.
                 Do NOT fall through to HTML in this call.
               - HTTP error / timeout / parse exception —
                 ``_batch_api_works`` stays None.  Stop.
                 Next call retries the probe.
          c. ``_batch_api_works is False``, OR no ``_tag_api_url``,
             OR no auth —
             per-post ``ensure_categories`` for each uncached post,
             bounded by ``Semaphore(_PREFETCH_CONCURRENCY)``.
        """
        # Step 1: cache-compose everything we can
        uncached: list["Post"] = []
        for p in posts:
            if p.tag_categories:
                continue
            if not self.try_compose_from_cache(p):
                uncached.append(p)
        if not uncached:
            return
        # Step 2: route decision
        if self._batch_api_works is True and self._batch_api_available():
            # Branch (a): batch API known to work
            try:
                await self.fetch_via_tag_api(uncached)
            except Exception as e:
                log.warning("Batch prefetch failed: %s: %s", type(e).__name__, e)
            return
        if self._batch_api_works is None and self._batch_api_available():
            # Branch (b): probe
            try:
                result = await self._probe_batch_api(uncached)
            except Exception as e:
                # Transient error → leave _batch_api_works = None, stop
                log.info("Batch API probe error (will retry next search): %s: %s",
                         type(e).__name__, e)
                return
            if result is True:
                # Probe succeeded — results already cached, posts composed
                return
            elif result is False:
                # Probe failed cleanly — stop, don't fall through to HTML
                return
            else:
                # result is None — transient, stop, retry next call
                return
        # Branch (c): per-post HTML scrape
        tasks = []
        for p in uncached:
            if not p.tag_categories:
                tasks.append(asyncio.create_task(self.ensure_categories(p)))
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _probe_batch_api(self, posts: list["Post"]) -> bool | None:
        """Probe whether the batch tag API works on this site.

        Returns:
          True  — probe succeeded, _batch_api_works set to True,
                  results already cached.
          False — clean HTTP 200 with 0 matching names,
                  _batch_api_works set to False.
          None  — transient error, _batch_api_works stays None.
        """
        # Collect a sample of uncached tag names for the probe
        all_tags: set[str] = set()
        for p in posts:
            all_tags.update(p.tag_list)
        cached = self._db.get_tag_labels(self._site_id, list(all_tags))
        missing = [t for t in all_tags if t not in cached]
        if not missing:
            # Everything's cached — can't probe, assume batch works
            self._batch_api_works = True
            for p in posts:
                self.try_compose_from_cache(p)
            return True
        tag_api_url = self._client._tag_api_url()
        if tag_api_url is None:
            return None
        # Send one batch request
        chunk = missing[:500]
        params: dict = {
            "page": "dapi",
            "s": "tag",
            "q": "index",
            "json": "1",
            "names": " ".join(chunk),
            "limit": len(chunk),
        }
        if self._client.api_key and self._client.api_user:
            # Same credential normalization as fetch_via_tag_api.
            key = self._client.api_key.strip().lstrip("&")
            user = self._client.api_user.strip().lstrip("&")
            if key and not key.startswith("api_key="):
                params["api_key"] = key
            if user and not user.startswith("user_id="):
                params["user_id"] = user
        try:
            resp = await self._client._request("GET", tag_api_url, params=params)
        except Exception:
            # Network/timeout error → transient, leave None
            return None
        if resp.status_code != 200:
            # Non-200 → transient, leave None
            return None
        try:
            entries = list(_parse_tag_response(resp))
        except Exception:
            # Parse error → transient, leave None
            return None
        # Check if ANY of the returned names match what we asked for
        asked = set(chunk)
        matched: dict[str, str] = {}
        for name, type_int in entries:
            label = _GELBOORU_TYPE_MAP.get(type_int)
            if label:
                matched[name] = label
        got_any = any(n in asked for n in matched)
        if got_any:
            self._batch_api_works = True
            if matched:
                self._db.set_tag_labels(self._site_id, matched)
            # Fetch any remaining missing tags via the batch path
            await self.fetch_via_tag_api(posts)
            return True
        else:
            # Clean 200 but zero matching names → structurally broken
            self._batch_api_works = False
            return False
# ---------------------------------------------------------------------------
# Parsers (module-level, stateless)
# ---------------------------------------------------------------------------
def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Extract tag categories from a Gelbooru-shape / Moebooru post-view page.

    Returns ``(categories_dict, labels_dict)`` where:
      - ``categories_dict`` is ``{label: [tag_names]}`` ready for
        ``post.tag_categories``.
      - ``labels_dict`` is ``{tag_name: label}`` ready for
        ``db.set_tag_labels``.
    """
    by_label: dict[str, list[str]] = {}
    label_of: dict[str, str] = {}
    for match in _TAG_TYPE_RE.finditer(html):
        css_type = match.group(1).lower()
        display = match.group(2).strip()
        # Skip empty link text and literal "?" entries.
        if display in ("", "?"):
            continue
        # Normalize to underscore-separated lowercase.
        normalized = display.replace(" ", "_").lower()
        mapped = _LABEL_MAP.get(css_type)
        if mapped is None:
            continue  # unknown tag-type class — skip rather than guess
        by_label.setdefault(mapped, []).append(normalized)
        label_of[normalized] = mapped
    return by_label, label_of
def _parse_tag_response(resp) -> list[tuple[str, int]]:
"""Parse a Gelbooru-shaped tag DAPI response, JSON or XML.
Gelbooru proper honors ``json=1`` and returns JSON. Rule34 and
Safebooru.org return XML even with ``json=1``. We sniff the
body's first non-whitespace char to choose a parser.
Returns ``[(name, type_int), ...]``.
"""
body = resp.text.lstrip()
if not body:
return []
out: list[tuple[str, int]] = []
if body.startswith("<"):
try:
root = ET.fromstring(body)
except ET.ParseError as e:
log.warning("Tag XML parse failed: %s", e)
return []
for tag in root.iter("tag"):
name = tag.get("name")
type_val = tag.get("type")
if name and type_val is not None:
try:
out.append((name, int(type_val)))
except (ValueError, TypeError):
pass
else:
try:
data = resp.json()
except Exception as e:
log.warning("Tag JSON parse failed: %s", e)
return []
if isinstance(data, dict):
data = data.get("tag", [])
if not isinstance(data, list):
return []
for entry in data:
name = entry.get("name")
type_val = entry.get("type")
if name and type_val is not None:
try:
out.append((name, int(type_val)))
except (ValueError, TypeError):
pass
return out
def _canonical_order(cats: dict[str, list[str]]) -> dict[str, list[str]]:
    """Reorder to Artist > Character > Copyright > ... > Meta.

    Labels not present in ``_CATEGORY_ORDER`` are appended after the
    canonical ones, in their original insertion order.
    """
    canonical = [label for label in _CATEGORY_ORDER if label in cats]
    leftovers = [label for label in cats if label not in _CATEGORY_ORDER]
    return {label: cats[label] for label in canonical + leftovers}