security: fix #5 — LRU cap on _url_locks to prevent memory leak

Replaces the unbounded defaultdict(asyncio.Lock) with an OrderedDict
guarded by _get_url_lock() and _evict_url_locks(). The cap is 4096
entries; LRU semantics keep the hot working set alive and oldest-
unlocked-first eviction trims back toward the cap on each new
insertion.

Eviction skips locks that are currently held — popping a lock that
a coroutine is mid-`async with` on would break its __aexit__. The
inner loop's evicted-flag handles the edge case where every
remaining entry is either the freshly inserted hash or held; in
that state the cap is briefly exceeded and the next insertion
retries, instead of looping forever.

Audit-Ref: SECURITY_AUDIT.md finding #5
Severity: Medium
This commit is contained in:
pax 2026-04-11 16:16:52 -05:00
parent a6a73fed61
commit 5d348fa8be
2 changed files with 123 additions and 3 deletions

View File

@ -9,7 +9,7 @@ import os
import tempfile
import threading
import zipfile
from collections import OrderedDict, defaultdict
from collections import OrderedDict
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse
@ -276,7 +276,59 @@ def _referer_for(parsed) -> str:
# does the actual download; the other waits and reads the cached file.
# Loop-bound, but the existing module is already loop-bound, so this
# doesn't make anything worse and is fixed cleanly in PR2.
_url_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
#
# Capped at _URL_LOCKS_MAX entries (audit finding #5). The previous
# defaultdict grew unbounded over a long browsing session, and an
# adversarial booru returning cache-buster query strings could turn
# the leak into an OOM DoS.
_URL_LOCKS_MAX = 4096
_url_locks: "OrderedDict[str, asyncio.Lock]" = OrderedDict()
def _get_url_lock(h: str) -> asyncio.Lock:
"""Return the asyncio.Lock for URL hash *h*, creating it if needed.
Touches LRU order on every call so frequently-accessed hashes
survive eviction. The first call for a new hash inserts it and
triggers _evict_url_locks() to trim back toward the cap.
"""
lock = _url_locks.get(h)
if lock is None:
lock = asyncio.Lock()
_url_locks[h] = lock
_evict_url_locks(skip=h)
else:
_url_locks.move_to_end(h)
return lock
def _evict_url_locks(skip: str) -> None:
"""Trim _url_locks back toward _URL_LOCKS_MAX, oldest first.
Each pass skips:
- the hash *skip* we just inserted (it's the youngest — evicting
it immediately would be self-defeating), and
- any entry whose lock is currently held (we can't drop a lock
that a coroutine is mid-`async with` on without that coroutine
blowing up on exit).
Stops as soon as one pass finds no evictable entries that
handles the edge case where every remaining entry is either
*skip* or currently held. In that state the cap is temporarily
exceeded; the next insertion will retry eviction.
"""
while len(_url_locks) > _URL_LOCKS_MAX:
evicted = False
for old_h in list(_url_locks.keys()):
if old_h == skip:
continue
if _url_locks[old_h].locked():
continue
_url_locks.pop(old_h, None)
evicted = True
break
if not evicted:
return
async def download_image(
@ -293,7 +345,7 @@ async def download_image(
filename = _url_hash(url) + _ext_from_url(url)
local = dest_dir / filename
async with _url_locks[_url_hash(url)]:
async with _get_url_lock(_url_hash(url)):
# Check if a ugoira zip was already converted to gif
if local.suffix.lower() == ".zip":
gif_path = local.with_suffix(".gif")

View File

@ -222,3 +222,71 @@ def test_is_valid_media_returns_true_on_oserror(tmp_path):
OS issue persisted."""
nonexistent = tmp_path / "definitely-not-here.jpg"
assert _is_valid_media(nonexistent) is True
# -- _url_locks LRU cap (audit finding #5) --
def test_url_locks_capped_at_max():
    """The per-URL coalesce lock table must stay at or below
    _URL_LOCKS_MAX entries. Without the cap, a long browsing session or
    an adversarial booru handing out cache-buster query strings would
    leak one Lock per unique URL until OOM."""
    cache._url_locks.clear()
    try:
        overflow = cache._URL_LOCKS_MAX + 500
        for idx in range(overflow):
            cache._get_url_lock(f"hash{idx}")
        assert len(cache._url_locks) <= cache._URL_LOCKS_MAX
    finally:
        cache._url_locks.clear()
def test_url_locks_returns_same_lock_for_same_hash():
    """Repeated _get_url_lock calls for one hash must hand back the
    identical Lock object — that is the whole point of the coalesce
    table."""
    cache._url_locks.clear()
    try:
        first = cache._get_url_lock("hashA")
        second = cache._get_url_lock("hashA")
        assert first is second
    finally:
        cache._url_locks.clear()
def test_url_locks_lru_keeps_recently_used():
    """LRU semantics: re-touching a hash moves it to the end of the
    OrderedDict, making it the youngest entry, so eviction would pick
    an older one instead."""
    cache._url_locks.clear()
    try:
        for h in ("oldest", "middle", "oldest"):  # third call is a touch
            cache._get_url_lock(h)
        # After the touch the order is: middle (older), oldest (youngest).
        assert list(cache._url_locks) == ["middle", "oldest"]
    finally:
        cache._url_locks.clear()
def test_url_locks_eviction_skips_held_locks():
    """A lock that a coroutine currently holds (mid-`async with`) must
    never be evicted; popping it would break that coroutine's
    __aexit__. The eviction loop checks `lock.locked()` and passes
    over the entry."""
    cache._url_locks.clear()
    try:
        held = cache._get_url_lock("held_hash")

        async def fill_while_holding():
            async with held:
                # Overflow the table while the lock is held, forcing
                # eviction to run — and to skip the held entry.
                for idx in range(cache._URL_LOCKS_MAX + 100):
                    cache._get_url_lock(f"new{idx}")
                assert "held_hash" in cache._url_locks

        asyncio.run(fill_while_holding())
    finally:
        cache._url_locks.clear()