fix: HTML parser two-pass rewrite + fire-and-forget prefetch
Three fixes:
1. HTML parser completely rewritten with two-pass approach:
- Pass 1: regex finds each tag-type element and its full inner
content (up to closing </li|span|td|div>)
- Pass 2: within the content, extracts the tag name from the
tags=NAME URL parameter in the search link
The old single-pass regex captured the ? wiki-link (first <a>)
instead of the tag name (second <a>). The URL-param extraction
works on Rule34 (40 tags), Safebooru.org (47 tags), and
yande.re (3 tags). Gelbooru proper returns 0 (post page only
has ? links with no tags= param) which is correct — Gelbooru
uses the batch tag API instead.
2. prefetch_batch is now truly fire-and-forget:
gelbooru.py and moebooru.py use asyncio.create_task instead of
await for prefetch_batch. search() returns immediately. The
probe + batch/HTML fetch runs in the background. Previously
search() blocked on the probe, which made Rule34 searches take
5+ seconds (slow/broken Rule34 API response time).
3. The partial-cache compose fix from the previous commit
complements this: posts with 49/50 cached tags now show all
available categories instead of nothing.
This commit is contained in:
parent
165733c6e0
commit
f0fe52c886
@ -38,15 +38,30 @@ log = logging.getLogger("booru")
|
||||
# HTML parser for the universal `class="tag-type-X"` convention
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches both `class="tag-type-artist"` and combined-class forms like
|
||||
# `class="tag-link tag-type-artist"` (Konachan). Captures the type
|
||||
# label and the tag name from the first <a> inside the element.
|
||||
_TAG_TYPE_RE = re.compile(
|
||||
# Two-pass approach:
|
||||
# 1. Find each tag-type element and its full inner content.
|
||||
# 2. Within the content, extract the tag name from the `tags=NAME`
|
||||
# URL parameter in the search link.
|
||||
#
|
||||
# This handles the cross-site variation cleanly:
|
||||
# - Gelbooru proper: only has `?` wiki links (no `tags=` param) →
|
||||
# returns 0 results, which is fine because Gelbooru uses the
|
||||
# batch tag API instead of HTML scraping.
|
||||
# - Rule34 / Safebooru.org: two <a> links per tag — `?` wiki link
|
||||
# + `<a href="...tags=TAGNAME">display name</a>`. We extract from
|
||||
# the URL, not the display text.
|
||||
# - yande.re / Konachan (Moebooru): same two-link pattern, but the
|
||||
# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`.
|
||||
#
|
||||
# The `tags=` extraction gives us the canonical underscore form
|
||||
# directly from the URL, no display-text normalization needed.
|
||||
_TAG_ELEMENT_RE = re.compile(
|
||||
r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME
|
||||
r'(?:[^<]*<[^>]*>)*?' # consume nested tags lazily
|
||||
r'<a[^>]*>([^<]+)</a>', # tag name in the link
|
||||
r'(.*?)' # inner content (lazy)
|
||||
r'</(?:li|span|td|div)>', # closing tag
|
||||
re.DOTALL,
|
||||
)
|
||||
_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)')
|
||||
|
||||
# HTML class name -> Capitalized label (matches danbooru.py / e621.py)
|
||||
_LABEL_MAP: dict[str, str] = {
|
||||
@ -478,17 +493,31 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]:
|
||||
``post.tag_categories``.
|
||||
- ``labels_dict`` is ``{tag_name: label}`` ready for
|
||||
``db.set_tag_labels``.
|
||||
|
||||
Uses a two-pass approach: find each ``tag-type-X`` element, then
|
||||
extract the tag name from the ``tags=NAME`` URL parameter inside
|
||||
the element's links. This avoids the `?` wiki-link ambiguity
|
||||
(Gelbooru-forks have a ``?`` link before the actual tag link).
|
||||
Returns empty on Gelbooru proper (whose post page only has ``?``
|
||||
links with no ``tags=`` parameter); that's fine because Gelbooru
|
||||
uses the batch tag API instead.
|
||||
"""
|
||||
from urllib.parse import unquote
|
||||
|
||||
cats: dict[str, list[str]] = {}
|
||||
labels: dict[str, str] = {}
|
||||
for m in _TAG_TYPE_RE.finditer(html):
|
||||
for m in _TAG_ELEMENT_RE.finditer(html):
|
||||
type_class = m.group(1).lower()
|
||||
raw_name = m.group(2).strip()
|
||||
if not raw_name or raw_name == "?":
|
||||
continue
|
||||
tag_name = raw_name.replace(" ", "_").lower()
|
||||
content = m.group(2)
|
||||
label = _LABEL_MAP.get(type_class)
|
||||
if label:
|
||||
if not label:
|
||||
continue
|
||||
tag_match = _TAG_NAME_RE.search(content)
|
||||
if not tag_match:
|
||||
continue
|
||||
tag_name = unquote(tag_match.group(1)).strip().lower()
|
||||
if not tag_name:
|
||||
continue
|
||||
cats.setdefault(label, []).append(tag_name)
|
||||
labels[tag_name] = label
|
||||
return cats, labels
|
||||
|
||||
@ -82,7 +82,8 @@ class GelbooruClient(BooruClient):
|
||||
)
|
||||
)
|
||||
if self.category_fetcher is not None:
|
||||
await self.category_fetcher.prefetch_batch(posts)
|
||||
import asyncio
|
||||
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
|
||||
return posts
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -57,7 +57,8 @@ class MoebooruClient(BooruClient):
|
||||
)
|
||||
)
|
||||
if self.category_fetcher is not None:
|
||||
await self.category_fetcher.prefetch_batch(posts)
|
||||
import asyncio
|
||||
asyncio.create_task(self.category_fetcher.prefetch_batch(posts))
|
||||
return posts
|
||||
|
||||
async def get_post(self, post_id: int) -> Post | None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user