api: CategoryFetcher module with HTML scrape + batch tag API + cache
New module core/api/category_fetcher.py — the unified tag-category
fetcher for boorus that don't return categories inline.
Public surface:
try_compose_from_cache(post) — instant, no HTTP. Builds
post.tag_categories from cached (site_id, name) -> label
entries. Returns True if every tag in the post is cached.
fetch_via_tag_api(posts) — batch fast path. Collects uncached
tags across posts, chunks into 500-name batches, GETs the
tag DAPI. Only available when the client declares _tag_api_url
AND has credentials (Gelbooru proper). Includes JSON/XML
sniffing parser ported from the reverted code.
fetch_post(post) — universal fallback. HTTP GETs the post-view
HTML page, regex-extracts class="tag-type-X">name</a>
markup. Works on every Gelbooru fork and every Moebooru
deployment. Does NOT require auth.
ensure_categories(post) — idempotent dispatch: cache compose ->
batch API (if available) -> HTML scrape. Coalesces concurrent
calls for the same post.id via an in-flight task dict.
prefetch_batch(posts) — fire-and-forget background prefetch.
ONE fetch path per invocation (no mixing batch + HTML).
Probe-and-cache for the batch tag API:
_batch_api_works = None -> not yet probed OR transient error
(retry next call)
_batch_api_works = True -> batch works (Gelbooru proper)
_batch_api_works = False -> clean 200 + zero matching names
(Rule34's broken names= filter)
Transition to True/False is permanent per instance. Transient
errors (HTTP error, timeout, parse exception) leave None so the
next search retries the probe.
HTML regex handles both standard tag-type-artist and combined-
class forms like tag-link tag-type-artist (Konachan). Tag names
normalized to underscore-separated lowercase.
Canonical category order: Artist > Character > Copyright >
Species > General > Meta > Lore (matches danbooru/e621 inline).
Dead code at this commit — no integration yet.
This commit is contained in:
parent
5395569213
commit
e00d88e1ec
546
booru_viewer/core/api/category_fetcher.py
Normal file
546
booru_viewer/core/api/category_fetcher.py
Normal file
@ -0,0 +1,546 @@
|
|||||||
|
"""Per-post HTML scrape + per-tag cache for boorus that don't return
|
||||||
|
tag categories inline (Gelbooru-shape, Moebooru).
|
||||||
|
|
||||||
|
Optionally accelerated by a batch-tag-API fast path when the attached
|
||||||
|
BooruClient declares a ``_tag_api_url`` AND has credentials. The fast
|
||||||
|
path fetches up to 500 tag types per request via the booru's tag DAPI,
|
||||||
|
avoiding per-post HTML scraping entirely on sites that support it.
|
||||||
|
|
||||||
|
The per-post HTML scrape path is the correctness baseline — it works on
|
||||||
|
every Gelbooru fork and every Moebooru deployment regardless of auth or
|
||||||
|
API quirks. The batch API is an optimization that short-circuits it
|
||||||
|
when possible.
|
||||||
|
|
||||||
|
Architectural note: Moebooru's ``/tag.json?limit=0`` returns the entire
|
||||||
|
tag database in one request. A future "download tag database" feature
|
||||||
|
can pre-populate ``tag_types`` via that endpoint, after which
|
||||||
|
``try_compose_from_cache`` succeeds for every post without any per-post
|
||||||
|
HTTP. The cache-compose fast path already supports this — no
|
||||||
|
CategoryFetcher changes needed, just a new "populate cache from dump"
|
||||||
|
entry point.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .base import BooruClient, Post
|
||||||
|
from ..db import Database
|
||||||
|
|
||||||
|
log = logging.getLogger("booru")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# HTML parser for the universal `class="tag-type-X"` convention
# ---------------------------------------------------------------------------

# Matches both `class="tag-type-artist"` and combined-class forms like
# `class="tag-link tag-type-artist"` (Konachan). Captures the type
# label and the tag name from the first <a> inside the element.
# NOTE: the middle group consumes nested opening/closing tags lazily, so
# the FIRST <a> after the classed element is the one captured.
_TAG_TYPE_RE = re.compile(
    r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>'  # class containing tag-type-NAME
    r'(?:[^<]*<[^>]*>)*?'                         # consume nested tags lazily
    r'<a[^>]*>([^<]+)</a>',                       # tag name in the link
    re.DOTALL,
)

# HTML class name -> Capitalized label (matches danbooru.py / e621.py).
# Classes not listed here (e.g. "deprecated") are silently dropped by the
# parser, since _LABEL_MAP.get() returns None for them.
_LABEL_MAP: dict[str, str] = {
    "general": "General",
    "artist": "Artist",
    "character": "Character",
    "copyright": "Copyright",
    "metadata": "Meta",   # Gelbooru spells it "metadata"
    "meta": "Meta",       # Moebooru forks spell it "meta"
    "species": "Species",
    "circle": "Circle",
    "style": "Style",
}

# Gelbooru tag DAPI integer code -> Capitalized label (for fetch_via_tag_api)
_GELBOORU_TYPE_MAP: dict[int, str] = {
    0: "General",
    1: "Artist",
    3: "Copyright",
    4: "Character",
    5: "Meta",
    # 2 = Deprecated — intentionally omitted
}

# Canonical display order for category-grouped tags. Matches the
# insertion order danbooru.py and e621.py produce for their inline
# categorization, so the info panel renders consistently across all
# booru types.
_CATEGORY_ORDER = [
    "Artist", "Character", "Copyright", "Species",
    "General", "Meta", "Lore",
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CategoryFetcher
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class CategoryFetcher:
    """Fetch and cache tag categories for boorus without inline data.

    Three entry points share one cache:

    * ``try_compose_from_cache`` — instant, no HTTP.
    * ``fetch_via_tag_api`` — batch fast path for Gelbooru proper.
    * ``fetch_post`` — per-post HTML scrape, universal fallback.

    ``ensure_categories`` and ``prefetch_batch`` are the public
    dispatch methods that route through these.
    """

    _PREFETCH_CONCURRENCY = 3  # safebooru.org soft-limits at >3

    def __init__(
        self,
        client: "BooruClient",
        db: "Database",
        site_id: int,
    ) -> None:
        self._client = client
        self._db = db
        # Cache keys are namespaced by site: the same tag name can have
        # different categories on different boorus.
        self._site_id = site_id
        self._sem = asyncio.Semaphore(self._PREFETCH_CONCURRENCY)
        # post.id -> in-flight ensure task, used to coalesce concurrent
        # ensure_categories calls for the same post.
        self._inflight: dict[int, asyncio.Task] = {}

        self._batch_api_works: bool | None = None
        # Probe state for the batch tag API:
        #
        # None  — not yet probed, OR last probe hit a transient
        #         error (HTTP error, timeout, parse exception).
        #         Next prefetch_batch will retry the probe.
        # True  — probe succeeded (response contained >=1 of the
        #         requested names). Batch API used for all future
        #         calls on this instance.
        # False — probe got a clean HTTP 200 with zero matching
        #         names for ANY of the requested tags. The API
        #         is structurally broken on this site (Rule34's
        #         ``names=`` filter returns unrelated tags).
        #         Per-post HTML used for all future calls.
        #
        # Transition to False is permanent for the instance lifetime.
        # Transition to True is permanent for the instance lifetime.
        # None -> None on transient error preserves retry ability.

    # ----- cache compose (instant, no HTTP) -----

    def try_compose_from_cache(self, post: "Post") -> bool:
        """Build ``post.tag_categories`` from cached labels.

        Returns True if **every** tag in ``post.tag_list`` has a
        cached label (i.e. the composition is complete). When True
        the post is fully categorized and no HTTP is needed.
        """
        tags = post.tag_list
        if not tags:
            return True
        cached = self._db.get_tag_labels(self._site_id, tags)
        # set(tags) guards against duplicate names in tag_list inflating
        # the required count.
        if len(cached) < len(set(tags)):
            return False
        cats: dict[str, list[str]] = {}
        for tag in tags:
            label = cached.get(tag)
            # Falsy labels (cached "no category" entries) are skipped but
            # still count as "known", so composition is considered complete.
            if label:
                cats.setdefault(label, []).append(tag)
        if cats:
            post.tag_categories = _canonical_order(cats)
        return True

    # ----- batch tag API fast path -----

    def _batch_api_available(self) -> bool:
        """True when the attached client declares a tag API endpoint
        AND has credentials configured."""
        return (
            self._client._tag_api_url() is not None
            and bool(self._client.api_key)
            and bool(self._client.api_user)
        )

    async def fetch_via_tag_api(self, posts: list["Post"]) -> int:
        """Batch-fetch tag types via the booru's tag DAPI.

        Collects every unique uncached tag name across ``posts``,
        chunks into 500-name batches, GETs the tag DAPI for each
        chunk, writes the results to the cache, then runs
        ``try_compose_from_cache`` on every post.

        Returns the count of newly-cached tags.
        """
        # Collect unique uncached tag names
        all_tags: set[str] = set()
        for p in posts:
            all_tags.update(p.tag_list)
        if not all_tags:
            return 0
        cached = self._db.get_tag_labels(self._site_id, list(all_tags))
        missing = [t for t in all_tags if t not in cached]
        if not missing:
            # Nothing to fetch — but still compose posts from the cache.
            for p in posts:
                self.try_compose_from_cache(p)
            return 0

        tag_api_url = self._client._tag_api_url()
        if tag_api_url is None:
            return 0

        new_labels: dict[str, str] = {}
        BATCH = 500  # Gelbooru DAPI caps names-per-request around here
        for i in range(0, len(missing), BATCH):
            chunk = missing[i:i + BATCH]
            params: dict = {
                "page": "dapi",
                "s": "tag",
                "q": "index",
                "json": "1",
                "names": " ".join(chunk),
                "limit": len(chunk),
            }
            if self._client.api_key and self._client.api_user:
                # Users sometimes paste the raw "&api_key=..." query
                # fragment; strip stray '&' and skip values that still
                # carry the key= prefix rather than double-prefixing.
                key = self._client.api_key.strip().lstrip("&")
                user = self._client.api_user.strip().lstrip("&")
                if key and not key.startswith("api_key="):
                    params["api_key"] = key
                if user and not user.startswith("user_id="):
                    params["user_id"] = user
            try:
                # NOTE(review): assumes _request returns an httpx-style
                # response (raise_for_status/.text/.json) — confirm in base.
                resp = await self._client._request("GET", tag_api_url, params=params)
                resp.raise_for_status()
            except Exception as e:
                # Best-effort: a failed chunk is skipped, not fatal.
                log.warning("Batch tag API failed (%d names): %s: %s",
                            len(chunk), type(e).__name__, e)
                continue
            for name, type_int in _parse_tag_response(resp):
                label = _GELBOORU_TYPE_MAP.get(type_int)
                if label:
                    new_labels[name] = label

        if new_labels:
            self._db.set_tag_labels(self._site_id, new_labels)
        # Compose from the now-warm cache
        for p in posts:
            self.try_compose_from_cache(p)
        return len(new_labels)

    # ----- per-post HTML scrape (universal fallback) -----

    async def fetch_post(self, post: "Post") -> bool:
        """Scrape the post-view HTML page for categorized tags.

        Works on every Gelbooru fork and every Moebooru deployment.
        Does NOT require auth. Returns True on success.
        """
        url = self._client._post_view_url(post)
        if url is None:
            return False
        # Semaphore bounds concurrent HTML fetches (see _PREFETCH_CONCURRENCY).
        async with self._sem:
            try:
                resp = await self._client._request("GET", url)
                resp.raise_for_status()
            except Exception as e:
                log.warning("Category HTML fetch for #%d failed: %s: %s",
                            post.id, type(e).__name__, e)
                return False
        cats, labels = _parse_post_html(resp.text)
        if not cats:
            return False
        post.tag_categories = _canonical_order(cats)
        if labels:
            # Warm the per-tag cache so sibling posts can cache-compose.
            self._db.set_tag_labels(self._site_id, labels)
        return True

    # ----- dispatch: ensure (single post) -----

    async def ensure_categories(self, post: "Post") -> None:
        """Idempotent. Guarantee ``post.tag_categories`` is populated.

        Dispatch:
          1. Already populated → return.
          2. Cache compose → return if complete.
          3. Batch tag API (if available + probe passed) → return.
          4. Per-post HTML scrape → return.

        Coalesces concurrent calls for the same ``post.id``.
        """
        if post.tag_categories:
            return
        if self.try_compose_from_cache(post):
            return

        # Coalesce: if there's an in-flight fetch for this post, await it
        existing = self._inflight.get(post.id)
        if existing is not None and not existing.done():
            await existing
            return

        task = asyncio.create_task(self._do_ensure(post))
        self._inflight[post.id] = task
        try:
            await task
        finally:
            # Always clear the slot so a failed fetch can be retried later.
            self._inflight.pop(post.id, None)

    async def _do_ensure(self, post: "Post") -> None:
        """Inner dispatch for ensure_categories."""
        # Batch API path (for single-post ensure, e.g. click or save)
        if self._batch_api_works is True and self._batch_api_available():
            await self.fetch_via_tag_api([post])
            if post.tag_categories:
                return
        # HTML fallback
        await self.fetch_post(post)

    # ----- dispatch: prefetch (batch, fire-and-forget) -----

    async def prefetch_batch(self, posts: list["Post"]) -> None:
        """Background prefetch for a page of search results.

        ONE fetch path per invocation — no mixing batch API + HTML
        scrape in the same call.

        Dispatch (exactly one branch executes per call):

        a. ``_batch_api_works is True``
           → ``fetch_via_tag_api`` for all uncached posts.

        b. ``_batch_api_works is None`` AND capability check passes
           → ``fetch_via_tag_api`` as the probe.
             - HTTP 200 + >=1 requested name matched
               → ``_batch_api_works = True``. Done.
             - HTTP 200 + 0 requested names matched
               → ``_batch_api_works = False``. Stop.
                 Do NOT fall through to HTML in this call.
             - HTTP error / timeout / parse exception
               → ``_batch_api_works`` stays None. Stop.
                 Next call retries the probe.

        c. ``_batch_api_works is False``, OR no ``_tag_api_url``,
           OR no auth
           → per-post ``ensure_categories`` for each uncached post,
             bounded by ``Semaphore(_PREFETCH_CONCURRENCY)``.
        """
        # Step 1: cache-compose everything we can
        uncached: list["Post"] = []
        for p in posts:
            if p.tag_categories:
                continue
            if not self.try_compose_from_cache(p):
                uncached.append(p)
        if not uncached:
            return

        # Step 2: route decision
        if self._batch_api_works is True and self._batch_api_available():
            # Branch (a): batch API known to work
            try:
                await self.fetch_via_tag_api(uncached)
            except Exception as e:
                log.warning("Batch prefetch failed: %s: %s", type(e).__name__, e)
            return

        if self._batch_api_works is None and self._batch_api_available():
            # Branch (b): probe
            try:
                result = await self._probe_batch_api(uncached)
            except Exception as e:
                # Transient error → leave _batch_api_works = None, stop
                log.info("Batch API probe error (will retry next search): %s: %s",
                         type(e).__name__, e)
                return
            if result is True:
                # Probe succeeded — results already cached, posts composed
                return
            elif result is False:
                # Probe failed cleanly — stop, don't fall through to HTML
                return
            else:
                # result is None — transient, stop, retry next call
                return

        # Branch (c): per-post HTML scrape
        tasks = []
        for p in uncached:
            if not p.tag_categories:
                tasks.append(asyncio.create_task(self.ensure_categories(p)))
        if tasks:
            # return_exceptions keeps one failed post from cancelling the rest.
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _probe_batch_api(self, posts: list["Post"]) -> bool | None:
        """Probe whether the batch tag API works on this site.

        Returns:
          True  — probe succeeded, _batch_api_works set to True,
                  results already cached.
          False — clean HTTP 200 with 0 matching names,
                  _batch_api_works set to False.
          None  — transient error, _batch_api_works stays None.
        """
        # Collect a sample of uncached tag names for the probe
        all_tags: set[str] = set()
        for p in posts:
            all_tags.update(p.tag_list)
        cached = self._db.get_tag_labels(self._site_id, list(all_tags))
        missing = [t for t in all_tags if t not in cached]
        if not missing:
            # Everything's cached — can't probe, assume batch works
            self._batch_api_works = True
            for p in posts:
                self.try_compose_from_cache(p)
            return True

        tag_api_url = self._client._tag_api_url()
        if tag_api_url is None:
            return None

        # Send one batch request
        chunk = missing[:500]
        params: dict = {
            "page": "dapi",
            "s": "tag",
            "q": "index",
            "json": "1",
            "names": " ".join(chunk),
            "limit": len(chunk),
        }
        if self._client.api_key and self._client.api_user:
            # Same credential normalization as fetch_via_tag_api.
            key = self._client.api_key.strip().lstrip("&")
            user = self._client.api_user.strip().lstrip("&")
            if key and not key.startswith("api_key="):
                params["api_key"] = key
            if user and not user.startswith("user_id="):
                params["user_id"] = user

        try:
            resp = await self._client._request("GET", tag_api_url, params=params)
        except Exception:
            # Network/timeout error → transient, leave None
            return None

        if resp.status_code != 200:
            # Non-200 → transient, leave None
            return None

        try:
            entries = list(_parse_tag_response(resp))
        except Exception:
            # Parse error → transient, leave None
            return None

        # Check if ANY of the returned names match what we asked for
        asked = set(chunk)
        matched: dict[str, str] = {}
        for name, type_int in entries:
            label = _GELBOORU_TYPE_MAP.get(type_int)
            if label:
                matched[name] = label

        got_any = any(n in asked for n in matched)

        if got_any:
            self._batch_api_works = True
            if matched:
                self._db.set_tag_labels(self._site_id, matched)
            # Fetch any remaining missing tags via the batch path
            await self.fetch_via_tag_api(posts)
            return True
        else:
            # Clean 200 but zero matching names → structurally broken
            self._batch_api_works = False
            return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parsers (module-level, stateless)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
|
||||||
|
"""Extract tag categories from a Gelbooru-shape / Moebooru post-view page.
|
||||||
|
|
||||||
|
Returns ``(categories_dict, labels_dict)`` where:
|
||||||
|
- ``categories_dict`` is ``{label: [tag_names]}`` ready for
|
||||||
|
``post.tag_categories``.
|
||||||
|
- ``labels_dict`` is ``{tag_name: label}`` ready for
|
||||||
|
``db.set_tag_labels``.
|
||||||
|
"""
|
||||||
|
cats: dict[str, list[str]] = {}
|
||||||
|
labels: dict[str, str] = {}
|
||||||
|
for m in _TAG_TYPE_RE.finditer(html):
|
||||||
|
type_class = m.group(1).lower()
|
||||||
|
raw_name = m.group(2).strip()
|
||||||
|
if not raw_name or raw_name == "?":
|
||||||
|
continue
|
||||||
|
tag_name = raw_name.replace(" ", "_").lower()
|
||||||
|
label = _LABEL_MAP.get(type_class)
|
||||||
|
if label:
|
||||||
|
cats.setdefault(label, []).append(tag_name)
|
||||||
|
labels[tag_name] = label
|
||||||
|
return cats, labels
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tag_response(resp) -> list[tuple[str, int]]:
|
||||||
|
"""Parse a Gelbooru-shaped tag DAPI response, JSON or XML.
|
||||||
|
|
||||||
|
Gelbooru proper honors ``json=1`` and returns JSON. Rule34 and
|
||||||
|
Safebooru.org return XML even with ``json=1``. We sniff the
|
||||||
|
body's first non-whitespace char to choose a parser.
|
||||||
|
|
||||||
|
Returns ``[(name, type_int), ...]``.
|
||||||
|
"""
|
||||||
|
body = resp.text.lstrip()
|
||||||
|
if not body:
|
||||||
|
return []
|
||||||
|
out: list[tuple[str, int]] = []
|
||||||
|
if body.startswith("<"):
|
||||||
|
try:
|
||||||
|
root = ET.fromstring(body)
|
||||||
|
except ET.ParseError as e:
|
||||||
|
log.warning("Tag XML parse failed: %s", e)
|
||||||
|
return []
|
||||||
|
for tag in root.iter("tag"):
|
||||||
|
name = tag.get("name")
|
||||||
|
type_val = tag.get("type")
|
||||||
|
if name and type_val is not None:
|
||||||
|
try:
|
||||||
|
out.append((name, int(type_val)))
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
data = resp.json()
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("Tag JSON parse failed: %s", e)
|
||||||
|
return []
|
||||||
|
if isinstance(data, dict):
|
||||||
|
data = data.get("tag", [])
|
||||||
|
if not isinstance(data, list):
|
||||||
|
return []
|
||||||
|
for entry in data:
|
||||||
|
name = entry.get("name")
|
||||||
|
type_val = entry.get("type")
|
||||||
|
if name and type_val is not None:
|
||||||
|
try:
|
||||||
|
out.append((name, int(type_val)))
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _canonical_order(cats: dict[str, list[str]]) -> dict[str, list[str]]:
|
||||||
|
"""Reorder to Artist > Character > Copyright > ... > Meta."""
|
||||||
|
ordered: dict[str, list[str]] = {}
|
||||||
|
for label in _CATEGORY_ORDER:
|
||||||
|
if label in cats:
|
||||||
|
ordered[label] = cats[label]
|
||||||
|
for label in cats:
|
||||||
|
if label not in ordered:
|
||||||
|
ordered[label] = cats[label]
|
||||||
|
return ordered
|
||||||
Loading…
x
Reference in New Issue
Block a user