diff --git a/booru_viewer/core/cache.py b/booru_viewer/core/cache.py
index e317435..0636f73 100644
--- a/booru_viewer/core/cache.py
+++ b/booru_viewer/core/cache.py
@@ -9,7 +9,7 @@ import os
 import tempfile
 import threading
 import zipfile
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict
 from datetime import datetime
 from pathlib import Path
 from urllib.parse import urlparse
@@ -276,7 +276,59 @@ def _referer_for(parsed) -> str:
 # does the actual download; the other waits and reads the cached file.
 # Loop-bound, but the existing module is already loop-bound, so this
 # doesn't make anything worse and is fixed cleanly in PR2.
-_url_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
+#
+# Capped at _URL_LOCKS_MAX entries (audit finding #5). The previous
+# defaultdict grew unbounded over a long browsing session, and an
+# adversarial booru returning cache-buster query strings could turn
+# the leak into an OOM DoS.
+_URL_LOCKS_MAX = 4096
+_url_locks: "OrderedDict[str, asyncio.Lock]" = OrderedDict()
+
+
+def _get_url_lock(h: str) -> asyncio.Lock:
+    """Return the asyncio.Lock for URL hash *h*, creating it if needed.
+
+    Touches LRU order on every call so frequently-accessed hashes
+    survive eviction. The first call for a new hash inserts it and
+    triggers _evict_url_locks() to trim back toward the cap.
+    """
+    lock = _url_locks.get(h)
+    if lock is None:
+        lock = asyncio.Lock()
+        _url_locks[h] = lock
+        _evict_url_locks(skip=h)
+    else:
+        _url_locks.move_to_end(h)
+    return lock
+
+
+def _evict_url_locks(skip: str) -> None:
+    """Trim _url_locks back toward _URL_LOCKS_MAX, oldest first.
+
+    Each pass skips:
+    - the hash *skip* we just inserted (it's the youngest — evicting
+      it immediately would be self-defeating), and
+    - any entry whose lock is currently held (we can't drop a lock
+      that a coroutine is mid-`async with` on without that coroutine
+      blowing up on exit).
+
+    Stops as soon as one pass finds no evictable entries — that
+    handles the edge case where every remaining entry is either
+    *skip* or currently held. In that state the cap is temporarily
+    exceeded; the next insertion will retry eviction.
+    """
+    while len(_url_locks) > _URL_LOCKS_MAX:
+        evicted = False
+        for old_h in list(_url_locks.keys()):
+            if old_h == skip:
+                continue
+            if _url_locks[old_h].locked():
+                continue
+            _url_locks.pop(old_h, None)
+            evicted = True
+            break
+        if not evicted:
+            return
 
 
 async def download_image(
@@ -293,7 +345,7 @@
     filename = _url_hash(url) + _ext_from_url(url)
     local = dest_dir / filename
 
-    async with _url_locks[_url_hash(url)]:
+    async with _get_url_lock(_url_hash(url)):
         # Check if a ugoira zip was already converted to gif
         if local.suffix.lower() == ".zip":
             gif_path = local.with_suffix(".gif")
diff --git a/tests/core/test_cache.py b/tests/core/test_cache.py
index 87393c9..444e924 100644
--- a/tests/core/test_cache.py
+++ b/tests/core/test_cache.py
@@ -222,3 +222,72 @@ def test_is_valid_media_returns_true_on_oserror(tmp_path):
     OS issue persisted."""
     nonexistent = tmp_path / "definitely-not-here.jpg"
     assert _is_valid_media(nonexistent) is True
+
+
+# -- _url_locks LRU cap (audit finding #5) --
+
+def test_url_locks_capped_at_max():
+    """The per-URL coalesce lock table must not grow beyond _URL_LOCKS_MAX
+    entries. Without the cap, a long browsing session or an adversarial
+    booru returning cache-buster query strings would leak one Lock per
+    unique URL until OOM."""
+    cache._url_locks.clear()
+    try:
+        for i in range(cache._URL_LOCKS_MAX + 500):
+            cache._get_url_lock(f"hash{i}")
+        assert len(cache._url_locks) <= cache._URL_LOCKS_MAX
+    finally:
+        cache._url_locks.clear()
+
+
+def test_url_locks_returns_same_lock_for_same_hash():
+    """Two get_url_lock calls with the same hash must return the same
+    Lock object — that's the whole point of the coalesce table."""
+    cache._url_locks.clear()
+    try:
+        lock_a = cache._get_url_lock("hashA")
+        lock_b = cache._get_url_lock("hashA")
+        assert lock_a is lock_b
+    finally:
+        cache._url_locks.clear()
+
+
+def test_url_locks_lru_keeps_recently_used():
+    """LRU semantics: a hash that gets re-touched moves to the end of
+    the OrderedDict and is the youngest, so eviction picks an older
+    entry instead."""
+    cache._url_locks.clear()
+    try:
+        cache._get_url_lock("oldest")
+        cache._get_url_lock("middle")
+        cache._get_url_lock("oldest")  # touch — now youngest
+        # The dict should now be: middle, oldest (insertion order with
+        # move_to_end on the touch).
+        keys = list(cache._url_locks.keys())
+        assert keys == ["middle", "oldest"]
+    finally:
+        cache._url_locks.clear()
+
+
+def test_url_locks_eviction_skips_held_locks():
+    """A held lock (one a coroutine is mid-`async with` on) must NOT be
+    evicted; popping it would break the coroutine's __aexit__. The
+    eviction loop sees `lock.locked()` and skips it."""
+    cache._url_locks.clear()
+    try:
+        # Seed an entry and hold it.
+        held = cache._get_url_lock("held_hash")
+
+        async def hold_and_fill():
+            async with held:
+                # While we're holding the lock, force eviction by
+                # filling past the cap.
+                for i in range(cache._URL_LOCKS_MAX + 100):
+                    cache._get_url_lock(f"new{i}")
+                # The held lock must still be present.
+
+            assert "held_hash" in cache._url_locks
+
+        asyncio.run(hold_and_fill())
+    finally:
+        cache._url_locks.clear()