db: re-add tag_types cache table with string labels + auto-prune

Per-site tag-type cache for boorus that don't return categories
inline. Uses string labels ("Artist", "Character", "Copyright",
"General", "Meta") instead of the integer codes the reverted
version used — the labels come directly from HTML class names,
no mapping step needed.

Schema: tag_types(site_id, name, label TEXT, fetched_at)
        PRIMARY KEY (site_id, name)

Methods:
  get_tag_labels(site_id, names) — chunked 500-name SELECT
  set_tag_labels(site_id, mapping) — bulk INSERT OR REPLACE,
    auto-prunes oldest entries when the table exceeds 50k rows
  clear_tag_cache(site_id=None) — manual wipe, for future
    Settings UI "Clear tag cache" button

The 50k row cap prevents unbounded growth over months of
browsing multiple boorus. Normal usage (a few thousand unique
tags per site) never reaches it. When exceeded, the oldest
entries by fetched_at are pruned first — these are the tags the
user hasn't encountered recently and would be re-fetched cheaply
if needed.

Migration: CREATE TABLE IF NOT EXISTS in _migrate(), non-breaking
for existing databases.
This commit is contained in:
pax 2026-04-09 19:10:37 -05:00
parent 81fc4d93eb
commit 5395569213

View File

@ -124,6 +124,14 @@ CREATE TABLE IF NOT EXISTS saved_searches (
query TEXT NOT NULL,
site_id INTEGER
);
CREATE TABLE IF NOT EXISTS tag_types (
site_id INTEGER NOT NULL,
name TEXT NOT NULL,
label TEXT NOT NULL,
fetched_at TEXT NOT NULL,
PRIMARY KEY (site_id, name)
);
"""
_DEFAULTS = {
@ -252,6 +260,21 @@ class Database:
# Add tag_categories to favorites if missing
if "tag_categories" not in cols:
self._conn.execute("ALTER TABLE favorites ADD COLUMN tag_categories TEXT DEFAULT ''")
# Tag-type cache for boorus that don't return
# categorized tags inline (Gelbooru-shape, Moebooru).
# Per-site keying so forks don't cross-contaminate.
# Uses string labels ("Artist", "Character", ...)
# instead of integer codes — the labels come from
# the HTML class names directly.
self._conn.execute("""
CREATE TABLE IF NOT EXISTS tag_types (
site_id INTEGER NOT NULL,
name TEXT NOT NULL,
label TEXT NOT NULL,
fetched_at TEXT NOT NULL,
PRIMARY KEY (site_id, name)
)
""")
def close(self) -> None:
if self._conn:
@ -727,6 +750,81 @@ class Database:
with self._write():
self.conn.execute("DELETE FROM library_meta WHERE post_id = ?", (post_id,))
# -- Tag-type cache --
def get_tag_labels(self, site_id: int, names: list[str]) -> dict[str, str]:
    """Look up cached string labels for `names` on `site_id`.

    Only tags with a cache entry appear in the result; callers fetch
    the misses via CategoryFetcher and backfill them through
    set_tag_labels. Lookups are chunked so the number of bound
    variables stays under SQLite's per-statement limit.
    """
    if not names:
        return {}
    chunk_size = 500
    labels: dict[str, str] = {}
    for start in range(0, len(names), chunk_size):
        batch = names[start:start + chunk_size]
        marks = ",".join("?" for _ in batch)
        sql = (
            "SELECT name, label FROM tag_types"
            f" WHERE site_id = ? AND name IN ({marks})"
        )
        for row in self.conn.execute(sql, [site_id, *batch]):
            labels[row["name"]] = row["label"]
    return labels
def set_tag_labels(self, site_id: int, mapping: dict[str, str]) -> None:
    """Store (name -> label) cache entries for one site in bulk.

    Existing entries are overwritten (INSERT OR REPLACE) and their
    fetched_at timestamp refreshed. After writing, the oldest rows
    are pruned if the table has grown past _TAG_CACHE_MAX_ROWS, so
    the cache cannot grow without bound.
    """
    if not mapping:
        return
    stamp = datetime.now(timezone.utc).isoformat()
    params = [(site_id, tag, label, stamp) for tag, label in mapping.items()]
    with self._write():
        self.conn.executemany(
            "INSERT OR REPLACE INTO tag_types (site_id, name, label, fetched_at) "
            "VALUES (?, ?, ?, ?)",
            params,
        )
        self._prune_tag_cache()
_TAG_CACHE_MAX_ROWS = 50_000 # ~50k tags ≈ several months of browsing
def _prune_tag_cache(self) -> None:
"""Delete the oldest tag_types rows if the table exceeds the cap.
Keeps the most-recently-fetched entries. Runs inside an
existing _write() context from set_tag_labels, so no extra
transaction overhead. The cap is generous enough that
normal usage never hits it; it's a safety valve for users
who browse dozens of boorus over months without clearing.
"""
count = self.conn.execute("SELECT COUNT(*) FROM tag_types").fetchone()[0]
if count <= self._TAG_CACHE_MAX_ROWS:
return
excess = count - self._TAG_CACHE_MAX_ROWS
self.conn.execute(
"DELETE FROM tag_types WHERE rowid IN ("
" SELECT rowid FROM tag_types ORDER BY fetched_at ASC LIMIT ?"
")",
(excess,),
)
def clear_tag_cache(self, site_id: int | None = None) -> int:
"""Delete cached tag types. Pass site_id to clear one site,
or None to clear all. Returns rows deleted. Exposed for
future Settings UI "Clear tag cache" button."""
with self._write():
if site_id is not None:
cur = self.conn.execute("DELETE FROM tag_types WHERE site_id = ?", (site_id,))
else:
cur = self.conn.execute("DELETE FROM tag_types")
return cur.rowcount
# -- Settings --
def get_setting(self, key: str) -> str: