fix: HTML parser two-pass rewrite + fire-and-forget prefetch

Three fixes:

1. HTML parser completely rewritten with two-pass approach:
   - Pass 1: regex finds each tag-type element and its full inner
     content (up to closing </li|span|td|div>)
   - Pass 2: within the content, extracts the tag name from the
     tags=NAME URL parameter in the search link
   The old single-pass regex captured the ? wiki-link (first <a>)
   instead of the tag name (second <a>). The URL-param extraction
   works on Rule34 (40 tags), Safebooru.org (47 tags), and
   yande.re (3 tags). Gelbooru proper returns 0 (its post page
   only has ? links with no tags= param), which is correct —
   Gelbooru uses the batch tag API instead.

2. prefetch_batch is now truly fire-and-forget:
   gelbooru.py and moebooru.py use asyncio.create_task instead of
   await for prefetch_batch. search() returns immediately. The
   probe + batch/HTML fetch runs in the background. Previously
   search() blocked on the probe, which made Rule34 searches take
   5+ seconds (slow/broken Rule34 API response time).

3. The partial-cache compose fix from the previous commit
   complements this: posts with 49/50 cached tags now show all
   available categories instead of nothing.
This commit is contained in:
pax 2026-04-09 19:31:43 -05:00
parent 165733c6e0
commit f0fe52c886
3 changed files with 47 additions and 16 deletions

View File

@ -38,15 +38,30 @@ log = logging.getLogger("booru")
# HTML parser for the universal `class="tag-type-X"` convention # HTML parser for the universal `class="tag-type-X"` convention
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Matches both `class="tag-type-artist"` and combined-class forms like # Two-pass approach:
# `class="tag-link tag-type-artist"` (Konachan). Captures the type # 1. Find each tag-type element and its full inner content.
# label and the tag name from the first <a> inside the element. # 2. Within the content, extract the tag name from the `tags=NAME`
_TAG_TYPE_RE = re.compile( # URL parameter in the search link.
#
# This handles the cross-site variation cleanly:
# - Gelbooru proper: only has `?` wiki links (no `tags=` param) →
# returns 0 results, which is fine because Gelbooru uses the
# batch tag API instead of HTML scraping.
# - Rule34 / Safebooru.org: two <a> links per tag — `?` wiki link
# + `<a href="...tags=TAGNAME">display name</a>`. We extract from
# the URL, not the display text.
# - yande.re / Konachan (Moebooru): same two-link pattern, but the
# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`.
#
# The `tags=` extraction gives us the canonical underscore form
# directly from the URL, no display-text normalization needed.
_TAG_ELEMENT_RE = re.compile(
r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME
r'(?:[^<]*<[^>]*>)*?' # consume nested tags lazily r'(.*?)' # inner content (lazy)
r'<a[^>]*>([^<]+)</a>', # tag name in the link r'</(?:li|span|td|div)>', # closing tag
re.DOTALL, re.DOTALL,
) )
_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)')
# HTML class name -> Capitalized label (matches danbooru.py / e621.py) # HTML class name -> Capitalized label (matches danbooru.py / e621.py)
_LABEL_MAP: dict[str, str] = { _LABEL_MAP: dict[str, str] = {
@ -478,17 +493,31 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
``post.tag_categories``. ``post.tag_categories``.
- ``labels_dict`` is ``{tag_name: label}`` ready for - ``labels_dict`` is ``{tag_name: label}`` ready for
``db.set_tag_labels``. ``db.set_tag_labels``.
Uses a two-pass approach: find each ``tag-type-X`` element, then
extract the tag name from the ``tags=NAME`` URL parameter inside
the element's links. This avoids the `?` wiki-link ambiguity
(Gelbooru-forks have a ``?`` link before the actual tag link).
Returns empty on Gelbooru proper (whose post page only has ``?``
links with no ``tags=`` parameter); that's fine because Gelbooru
uses the batch tag API instead.
""" """
from urllib.parse import unquote
cats: dict[str, list[str]] = {} cats: dict[str, list[str]] = {}
labels: dict[str, str] = {} labels: dict[str, str] = {}
for m in _TAG_TYPE_RE.finditer(html): for m in _TAG_ELEMENT_RE.finditer(html):
type_class = m.group(1).lower() type_class = m.group(1).lower()
raw_name = m.group(2).strip() content = m.group(2)
if not raw_name or raw_name == "?":
continue
tag_name = raw_name.replace(" ", "_").lower()
label = _LABEL_MAP.get(type_class) label = _LABEL_MAP.get(type_class)
if label: if not label:
continue
tag_match = _TAG_NAME_RE.search(content)
if not tag_match:
continue
tag_name = unquote(tag_match.group(1)).strip().lower()
if not tag_name:
continue
cats.setdefault(label, []).append(tag_name) cats.setdefault(label, []).append(tag_name)
labels[tag_name] = label labels[tag_name] = label
return cats, labels return cats, labels

View File

@ -82,7 +82,8 @@ class GelbooruClient(BooruClient):
) )
) )
if self.category_fetcher is not None: if self.category_fetcher is not None:
await self.category_fetcher.prefetch_batch(posts) import asyncio
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
return posts return posts
@staticmethod @staticmethod

View File

@ -57,7 +57,8 @@ class MoebooruClient(BooruClient):
) )
) )
if self.category_fetcher is not None: if self.category_fetcher is not None:
await self.category_fetcher.prefetch_batch(posts) import asyncio
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
return posts return posts
async def get_post(self, post_id: int) -> Post | None: async def get_post(self, post_id: int) -> Post | None: