category_fetcher: persist batch API probe result across sessions

The probe that detects whether a site's batch tag API works
(Gelbooru proper: yes, Rule34: no) now persists its result in the
tag_types table using a sentinel key (`__batch_api_probe__`). On
subsequent app launches, the fetcher reads the saved result at
construction time and skips the probe entirely.

Before: every session with Rule34 wasted ~0.6s on a probe request
that always fails (Rule34 returns garbage for `names=`). During that
time the background prefetch couldn't start HTML scraping, so the
first few post clicks paid ~0.3s each.

After: the first-ever session probes Rule34 once and stores False. Every
subsequent session reads False from DB, skips the probe, and the
background prefetch immediately starts HTML scraping. By the time
the user clicks any post, the scrape is usually done.

Gelbooru proper: probe succeeds on first session, stores True.
Future sessions use the batch API without probing. No change in
speed (already fast), just saves the probe roundtrip.

Persisted per site_id so different Gelbooru-shaped sites get their
own probe result. The clear_tag_cache method wipes probe results
along with tag data (the sentinel key lives in the same table).
This commit is contained in:
pax 2026-04-09 19:46:20 -05:00
parent 1547cbe55a
commit 7d11aeab06

View File

@ -127,24 +127,36 @@ class CategoryFetcher:
self._sem = asyncio.Semaphore(self._PREFETCH_CONCURRENCY) self._sem = asyncio.Semaphore(self._PREFETCH_CONCURRENCY)
self._inflight: dict[int, asyncio.Task] = {} self._inflight: dict[int, asyncio.Task] = {}
self._batch_api_works: bool | None = None # Probe state for the batch tag API. Persisted to DB so
# Probe state for the batch tag API: # the probe runs at most ONCE per site, ever. Rule34's
# broken batch API is detected on the first session; every
# subsequent session skips the probe and goes straight to
# HTML prefetch (saving ~0.6s of wasted probe time).
# #
# None — not yet probed, OR last probe hit a transient # None — not yet probed, OR last probe hit a transient
# error (HTTP error, timeout, parse exception). # error. Next prefetch_batch retries the probe.
# Next prefetch_batch will retry the probe. # True — probe succeeded (Gelbooru proper). Permanent.
# True — probe succeeded (response contained >=1 of the # False — clean 200 + zero matching names (Rule34).
# requested names). Batch API used for all future # Permanent. Per-post HTML from now on.
# calls on this instance. self._batch_api_works = self._load_probe_result()
# False — probe got a clean HTTP 200 with zero matching
# names for ANY of the requested tags. The API # ----- probe result persistence -----
# is structurally broken on this site (Rule34's
# ``names=`` filter returns unrelated tags). _PROBE_KEY = "__batch_api_probe__" # sentinel name in tag_types
# Per-post HTML used for all future calls.
# def _load_probe_result(self) -> bool | None:
# Transition to False is permanent for the instance lifetime. """Read the persisted probe result from the DB, or None."""
# Transition to True is permanent for the instance lifetime. row = self._db.get_tag_labels(self._site_id, [self._PROBE_KEY])
# None -> None on transient error preserves retry ability. val = row.get(self._PROBE_KEY)
if val == "true":
return True
elif val == "false":
return False
return None
def _save_probe_result(self, result: bool) -> None:
"""Persist the probe result so future sessions skip the probe."""
self._db.set_tag_labels(self._site_id, {self._PROBE_KEY: "true" if result else "false"})
# ----- cache compose (instant, no HTTP) ----- # ----- cache compose (instant, no HTTP) -----
@ -421,8 +433,10 @@ class CategoryFetcher:
cached = self._db.get_tag_labels(self._site_id, list(all_tags)) cached = self._db.get_tag_labels(self._site_id, list(all_tags))
missing = [t for t in all_tags if t not in cached] missing = [t for t in all_tags if t not in cached]
if not missing: if not missing:
# Everything's cached — can't probe, assume batch works # Everything's cached — can't probe, skip
if self._batch_api_works is None:
self._batch_api_works = True self._batch_api_works = True
self._save_probe_result(True)
for p in posts: for p in posts:
self.try_compose_from_cache(p) self.try_compose_from_cache(p)
return True return True
@ -477,6 +491,7 @@ class CategoryFetcher:
if got_any: if got_any:
self._batch_api_works = True self._batch_api_works = True
self._save_probe_result(True)
if matched: if matched:
self._db.set_tag_labels(self._site_id, matched) self._db.set_tag_labels(self._site_id, matched)
# Fetch any remaining missing tags via the batch path # Fetch any remaining missing tags via the batch path
@ -485,6 +500,7 @@ class CategoryFetcher:
else: else:
# Clean 200 but zero matching names → structurally broken # Clean 200 but zero matching names → structurally broken
self._batch_api_works = False self._batch_api_works = False
self._save_probe_result(False)
return False return False