diff --git a/booru_viewer/core/api/category_fetcher.py b/booru_viewer/core/api/category_fetcher.py index 07f5e54..b2f2eca 100644 --- a/booru_viewer/core/api/category_fetcher.py +++ b/booru_viewer/core/api/category_fetcher.py @@ -38,15 +38,30 @@ log = logging.getLogger("booru") # HTML parser for the universal `class="tag-type-X"` convention # --------------------------------------------------------------------------- -# Matches both `class="tag-type-artist"` and combined-class forms like -# `class="tag-link tag-type-artist"` (Konachan). Captures the type -# label and the tag name from the first inside the element. -_TAG_TYPE_RE = re.compile( +# Two-pass approach: +# 1. Find each tag-type element and its full inner content. +# 2. Within the content, extract the tag name from the `tags=NAME` +# URL parameter in the search link. +# +# This handles the cross-site variation cleanly: +# - Gelbooru proper: only has `?` wiki links (no `tags=` param) → +# returns 0 results, which is fine because Gelbooru uses the +# batch tag API instead of HTML scraping. +# - Rule34 / Safebooru.org: two links per tag — `?` wiki link +# + `display name`. We extract from +# the URL, not the display text. +# - yande.re / Konachan (Moebooru): same two-link pattern, but the +# URL is `/post?tags=TAGNAME` instead of `page=post&s=list&tags=`. +# +# The `tags=` extraction gives us the canonical underscore form +# directly from the URL, no display-text normalization needed. +_TAG_ELEMENT_RE = re.compile( r'class="[^"]*tag-type-([a-z]+)[^"]*"[^>]*>' # class containing tag-type-NAME - r'(?:[^<]*<[^>]*>)*?' # consume nested tags lazily - r']*>([^<]+)', # tag name in the link + r'(.*?)' # inner content (lazy) + r'', # closing tag re.DOTALL, ) +_TAG_NAME_RE = re.compile(r'tags=([^&"<>\s]+)') # HTML class name -> Capitalized label (matches danbooru.py / e621.py) _LABEL_MAP: dict[str, str] = { @@ -478,19 +493,33 @@ def _parse_post_html(html: str) -> tuple[dict[str, list[str]], dict[str, str]]: ``post.tag_categories``. - ``labels_dict`` is ``{tag_name: label}`` ready for ``db.set_tag_labels``. + + Uses a two-pass approach: find each ``tag-type-X`` element, then + extract the tag name from the ``tags=NAME`` URL parameter inside + the element's links. This avoids the `?` wiki-link ambiguity + (Gelbooru-forks have a ``?`` link before the actual tag link). + Returns empty on Gelbooru proper (whose post page only has ``?`` + links with no ``tags=`` parameter); that's fine because Gelbooru + uses the batch tag API instead. """ + from urllib.parse import unquote + cats: dict[str, list[str]] = {} labels: dict[str, str] = {} - for m in _TAG_TYPE_RE.finditer(html): + for m in _TAG_ELEMENT_RE.finditer(html): type_class = m.group(1).lower() - raw_name = m.group(2).strip() - if not raw_name or raw_name == "?": - continue - tag_name = raw_name.replace(" ", "_").lower() + content = m.group(2) label = _LABEL_MAP.get(type_class) - if label: - cats.setdefault(label, []).append(tag_name) - labels[tag_name] = label + if not label: + continue + tag_match = _TAG_NAME_RE.search(content) + if not tag_match: + continue + tag_name = unquote(tag_match.group(1)).strip().lower() + if not tag_name: + continue + cats.setdefault(label, []).append(tag_name) + labels[tag_name] = label return cats, labels diff --git a/booru_viewer/core/api/gelbooru.py b/booru_viewer/core/api/gelbooru.py index f47e850..3544c0f 100644 --- a/booru_viewer/core/api/gelbooru.py +++ b/booru_viewer/core/api/gelbooru.py @@ -82,7 +82,8 @@ class GelbooruClient(BooruClient): ) ) if self.category_fetcher is not None: - await self.category_fetcher.prefetch_batch(posts) + import asyncio + asyncio.create_task(self.category_fetcher.prefetch_batch(posts)) return posts @staticmethod diff --git a/booru_viewer/core/api/moebooru.py b/booru_viewer/core/api/moebooru.py index 05d3ced..e7a404c 100644 --- a/booru_viewer/core/api/moebooru.py +++ b/booru_viewer/core/api/moebooru.py @@ -57,7 +57,8 @@ class MoebooruClient(BooruClient): ) ) if self.category_fetcher is not None: - await self.category_fetcher.prefetch_batch(posts) + import asyncio + asyncio.create_task(self.category_fetcher.prefetch_batch(posts)) return posts async def get_post(self, post_id: int) -> Post | None: