From 5976a81bb640db90dd9532ed79da9229623827a8 Mon Sep 17 00:00:00 2001 From: pax Date: Thu, 9 Apr 2026 17:32:21 -0500 Subject: [PATCH] db: add reconcile_library_meta to clean up orphan meta rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old delete_from_library deleted files from disk but never cleaned up the matching library_meta row. Result: in the pathological case the meta table can have many more rows than there are files on disk. This was harmless when the only consumer was tag-search (the meta would just match nothing useful), but it becomes a real problem the moment is_post_in_library / get_saved_post_ids start driving UI state — the saved-dot indicator would light up for posts whose files have been gone for ages. reconcile_library_meta() walks saved_dir() shallowly (root + one level of subdirs), collects every present post_id (digit-stem files plus templated filenames looked up via library_meta.filename), and DELETEs every meta row whose post_id isn't in that set. Returns the count of removed rows. Defensive: if saved_dir() is missing, or exists but contains no media files (e.g. a removable drive temporarily unmounted), the method refuses to reconcile and returns 0. The cost of a false positive — wiping every meta row for a perfectly intact library — is higher than the cost of leaving stale rows around for one more session. The cache.py fix in the next commit makes future delete_from_library calls clean up after themselves. This method is the one-time catch-up for libraries that were already polluted before that fix. 
--- booru_viewer/core/db.py | 70 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/booru_viewer/core/db.py b/booru_viewer/core/db.py index 8ee5eb1..27af201 100644 --- a/booru_viewer/core/db.py +++ b/booru_viewer/core/db.py @@ -583,6 +583,76 @@ class Database: datetime.now(timezone.utc).isoformat(), filename), ) + def reconcile_library_meta(self) -> int: + """Drop library_meta rows whose files are no longer on disk. + + Walks every row, checks for both digit-stem (legacy v0.2.3) + and templated (post-refactor) filenames in saved_dir() + one + level of subdirectories, and deletes rows where neither is + found. Returns the number of rows removed. + + Cleans up the orphan rows that were leaked by the old + delete_from_library before it learned to clean up after + itself. Safe to call repeatedly — a no-op once the DB is + consistent with disk. + + Skips reconciliation entirely if saved_dir() is missing or + empty (defensive — a removable drive temporarily unmounted + shouldn't trigger a wholesale meta wipe). + """ + from .config import saved_dir, MEDIA_EXTENSIONS + sd = saved_dir() + if not sd.is_dir(): + return 0 + + # Build the set of (post_id present on disk). Walks shallow: + # root + one level of subdirectories. + on_disk_files: list[Path] = [] + for entry in sd.iterdir(): + if entry.is_file() and entry.suffix.lower() in MEDIA_EXTENSIONS: + on_disk_files.append(entry) + elif entry.is_dir(): + for sub in entry.iterdir(): + if sub.is_file() and sub.suffix.lower() in MEDIA_EXTENSIONS: + on_disk_files.append(sub) + if not on_disk_files: + # No files at all — refuse to reconcile. Could be an + # unmounted drive, a freshly-cleared library, etc. The + # cost of a false positive (wiping every meta row) is + # higher than the cost of leaving stale rows. 
+ return 0 + + present_post_ids: set[int] = set() + for f in on_disk_files: + if f.stem.isdigit(): + present_post_ids.add(int(f.stem)) + # Templated files: look up by filename + for f in on_disk_files: + if not f.stem.isdigit(): + row = self.conn.execute( + "SELECT post_id FROM library_meta WHERE filename = ? LIMIT 1", + (f.name,), + ).fetchone() + if row is not None: + present_post_ids.add(row["post_id"]) + + all_meta_ids = self.get_saved_post_ids() + stale = all_meta_ids - present_post_ids + if not stale: + return 0 + + with self._write(): + BATCH = 500 + stale_list = list(stale) + for i in range(0, len(stale_list), BATCH): + chunk = stale_list[i:i + BATCH] + placeholders = ",".join("?" * len(chunk)) + self.conn.execute( + f"DELETE FROM library_meta WHERE post_id IN ({placeholders})", + chunk, + ) + return len(stale) + def is_post_in_library(self, post_id: int) -> bool: """True iff a `library_meta` row exists for `post_id`.