category_fetcher: stop flipping _batch_api_works=False on transient errors in single-post path

behavior change: a single mid-call network drop could previously
poison _batch_api_works=False for the whole site, forcing every
future ensure_categories onto the slower HTML scrape path. _do_ensure
now routes the unprobed case through _probe_batch_api, which only
flips the flag on a clean HTTP 200 with zero matching names; timeout
and non-200 responses leave the flag None so the next call retries
the probe.

The bug surfaced because fetch_via_tag_api swallows per-chunk
failures with 'except Exception: continue', so the previous code
path couldn't distinguish 'API returned zero matches' from 'the
network dropped halfway through.' _probe_batch_api already made
that distinction for prefetch_batch; _do_ensure now reuses it.

Tests in tests/core/api/test_category_fetcher.py pin the three
routes (transient raise, clean-200-zero-matches, non-200).
This commit is contained in:
pax 2026-04-15 17:29:01 -05:00
parent ec9e44efbe
commit bbf0d3107b
3 changed files with 119 additions and 18 deletions

View File

@ -5,6 +5,9 @@
### Changed ### Changed
- Thumbnail drag-start threshold raised from 10px to 30px to match the rubber band's gate — small mouse wobbles on a thumb no longer trigger a file drag - Thumbnail drag-start threshold raised from 10px to 30px to match the rubber band's gate — small mouse wobbles on a thumb no longer trigger a file drag
### Fixed
- `category_fetcher._do_ensure` no longer permanently flips `_batch_api_works` to False when a transient network error drops a tag-API request mid-call; the unprobed path now routes through `_probe_batch_api`, which distinguishes clean 200-with-zero-matches (structurally broken, flip) from timeout/HTTP-error (transient, retry next call)
### Refactored ### Refactored
- `category_fetcher` batch tag-API params are now built by a shared `_build_tag_api_params` helper instead of duplicated across `fetch_via_tag_api` and `_probe_batch_api` - `category_fetcher` batch tag-API params are now built by a shared `_build_tag_api_params` helper instead of duplicated across `fetch_via_tag_api` and `_probe_batch_api`

View File

@ -357,29 +357,41 @@ class CategoryFetcher:
async def _do_ensure(self, post: "Post") -> None: async def _do_ensure(self, post: "Post") -> None:
"""Inner dispatch for ensure_categories. """Inner dispatch for ensure_categories.
Tries the batch API when it's known to work (True) OR not yet Dispatch:
probed (None). The result doubles as an inline probe: if the - ``_batch_api_works is True``: call ``fetch_via_tag_api``
batch produced categories, it works (save True); if it directly. If it populates categories we're done; a
returned nothing useful, it's broken (save False). Falls transient failure leaves them empty and we fall through
through to HTML scrape as the universal fallback. to the HTML scrape.
- ``_batch_api_works is None``: route through
``_probe_batch_api``, which only flips the flag to
True/False on a clean HTTP response. Transient errors
leave it ``None`` so the next call retries the probe.
Previously this path called ``fetch_via_tag_api`` and
inferred the result from empty ``tag_categories`` but
``fetch_via_tag_api`` swallows per-chunk failures with
``continue``, so a mid-call network drop poisoned
``_batch_api_works = False`` for the site permanently.
- ``_batch_api_works is False`` or unavailable: straight
to HTML scrape.
""" """
if self._batch_api_works is not False and self._batch_api_available(): if self._batch_api_works is True and self._batch_api_available():
try: try:
await self.fetch_via_tag_api([post]) await self.fetch_via_tag_api([post])
except Exception as e: except Exception as e:
log.debug("Batch API ensure failed (transient): %s", e) log.debug("Batch API ensure failed (transient): %s", e)
# Leave _batch_api_works at None → retry next call
else:
if post.tag_categories: if post.tag_categories:
if self._batch_api_works is None:
self._batch_api_works = True
self._save_probe_result(True)
return return
# Batch returned nothing → broken API (Rule34) or elif self._batch_api_works is None and self._batch_api_available():
# the specific post has only unknown tags (very rare). try:
if self._batch_api_works is None: result = await self._probe_batch_api([post])
self._batch_api_works = False except Exception as e:
self._save_probe_result(False) log.info("Batch API probe error (will retry next call): %s: %s",
type(e).__name__, e)
result = None
if result is True:
# Probe succeeded — results cached and post composed.
return
# result is False (broken API) or None (transient) — fall through
# HTML scrape fallback (works on Rule34/Safebooru.org/Moebooru, # HTML scrape fallback (works on Rule34/Safebooru.org/Moebooru,
# returns empty on Gelbooru proper which is fine because the # returns empty on Gelbooru proper which is fine because the
# batch path above covers Gelbooru) # batch path above covers Gelbooru)

View File

@ -454,3 +454,89 @@ class TestMaps:
assert _GELBOORU_TYPE_MAP[4] == "Character" assert _GELBOORU_TYPE_MAP[4] == "Character"
assert _GELBOORU_TYPE_MAP[5] == "Meta" assert _GELBOORU_TYPE_MAP[5] == "Meta"
assert 2 not in _GELBOORU_TYPE_MAP # Deprecated intentionally omitted assert 2 not in _GELBOORU_TYPE_MAP # Deprecated intentionally omitted
# ---------------------------------------------------------------------------
# _do_ensure dispatch — regression cover for transient-error poisoning
# ---------------------------------------------------------------------------
class TestDoEnsureProbeRouting:
"""When _batch_api_works is None, _do_ensure must route through
_probe_batch_api so transient errors stay transient. The prior
implementation called fetch_via_tag_api directly and inferred
False from empty tag_categories but fetch_via_tag_api swallows
per-chunk exceptions, so a network drop silently poisoned the
probe flag to False for the whole site."""
def test_transient_error_leaves_flag_none(self, tmp_db):
"""All chunks fail → _batch_api_works must stay None,
not flip to False."""
client = FakeClient(
tag_api_url="http://example.com/tags",
api_key="k",
api_user="u",
)
async def raising_request(method, url, params=None):
raise RuntimeError("network down")
client._request = raising_request
fetcher = CategoryFetcher(client, tmp_db, site_id=1)
assert fetcher._batch_api_works is None
post = FakePost(tags="miku 1girl")
asyncio.new_event_loop().run_until_complete(fetcher._do_ensure(post))
assert fetcher._batch_api_works is None, (
"Transient error must not poison the probe flag"
)
# Persistence side: nothing was saved
reloaded = CategoryFetcher(FakeClient(), tmp_db, site_id=1)
assert reloaded._batch_api_works is None
def test_clean_200_zero_matches_flips_to_false(self, tmp_db):
"""Clean HTTP 200 + no names matching the request → flips
the flag to False (structurally broken endpoint)."""
client = FakeClient(
tag_api_url="http://example.com/tags",
api_key="k",
api_user="u",
)
async def empty_ok_request(method, url, params=None):
# 200 with a valid but empty tag list
return FakeResponse(
json.dumps({"@attributes": {"count": 0}, "tag": []}),
status_code=200,
)
client._request = empty_ok_request
fetcher = CategoryFetcher(client, tmp_db, site_id=1)
post = FakePost(tags="definitely_not_a_real_tag")
asyncio.new_event_loop().run_until_complete(fetcher._do_ensure(post))
assert fetcher._batch_api_works is False, (
"Clean 200 with zero matches must flip flag to False"
)
reloaded = CategoryFetcher(FakeClient(), tmp_db, site_id=1)
assert reloaded._batch_api_works is False
def test_non_200_leaves_flag_none(self, tmp_db):
"""500-family responses are transient, must not poison."""
client = FakeClient(
tag_api_url="http://example.com/tags",
api_key="k",
api_user="u",
)
async def five_hundred(method, url, params=None):
return FakeResponse("", status_code=503)
client._request = five_hundred
fetcher = CategoryFetcher(client, tmp_db, site_id=1)
post = FakePost(tags="miku")
asyncio.new_event_loop().run_until_complete(fetcher._do_ensure(post))
assert fetcher._batch_api_works is None