security: fix #5 — LRU cap on _url_locks to prevent memory leak
Replaces the unbounded defaultdict(asyncio.Lock) with an OrderedDict guarded by _get_url_lock() and _evict_url_locks(). The cap is 4096 entries; LRU semantics keep the hot working set alive, and oldest-unlocked-first eviction trims back toward the cap on each new insertion. Eviction skips locks that are currently held — popping a lock that a coroutine is mid-`async with` on would break its __aexit__. The inner loop's evicted-flag handles the edge case where every remaining entry is either the freshly inserted hash or held; in that state the cap is briefly exceeded and the next insertion retries, instead of looping forever. Audit-Ref: SECURITY_AUDIT.md finding #5 Severity: Medium
This commit is contained in:
parent
a6a73fed61
commit
5d348fa8be
@ -9,7 +9,7 @@ import os
|
||||
import tempfile
|
||||
import threading
|
||||
import zipfile
|
||||
from collections import OrderedDict, defaultdict
|
||||
from collections import OrderedDict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
@ -276,7 +276,59 @@ def _referer_for(parsed) -> str:
|
||||
# does the actual download; the other waits and reads the cached file.
|
||||
# Loop-bound, but the existing module is already loop-bound, so this
|
||||
# doesn't make anything worse and is fixed cleanly in PR2.
|
||||
_url_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
|
||||
#
|
||||
# Capped at _URL_LOCKS_MAX entries (audit finding #5). The previous
|
||||
# defaultdict grew unbounded over a long browsing session, and an
|
||||
# adversarial booru returning cache-buster query strings could turn
|
||||
# the leak into an OOM DoS.
|
||||
_URL_LOCKS_MAX = 4096
|
||||
_url_locks: "OrderedDict[str, asyncio.Lock]" = OrderedDict()
|
||||
|
||||
|
||||
def _get_url_lock(h: str) -> asyncio.Lock:
|
||||
"""Return the asyncio.Lock for URL hash *h*, creating it if needed.
|
||||
|
||||
Touches LRU order on every call so frequently-accessed hashes
|
||||
survive eviction. The first call for a new hash inserts it and
|
||||
triggers _evict_url_locks() to trim back toward the cap.
|
||||
"""
|
||||
lock = _url_locks.get(h)
|
||||
if lock is None:
|
||||
lock = asyncio.Lock()
|
||||
_url_locks[h] = lock
|
||||
_evict_url_locks(skip=h)
|
||||
else:
|
||||
_url_locks.move_to_end(h)
|
||||
return lock
|
||||
|
||||
|
||||
def _evict_url_locks(skip: str) -> None:
|
||||
"""Trim _url_locks back toward _URL_LOCKS_MAX, oldest first.
|
||||
|
||||
Each pass skips:
|
||||
- the hash *skip* we just inserted (it's the youngest — evicting
|
||||
it immediately would be self-defeating), and
|
||||
- any entry whose lock is currently held (we can't drop a lock
|
||||
that a coroutine is mid-`async with` on without that coroutine
|
||||
blowing up on exit).
|
||||
|
||||
Stops as soon as one pass finds no evictable entries — that
|
||||
handles the edge case where every remaining entry is either
|
||||
*skip* or currently held. In that state the cap is temporarily
|
||||
exceeded; the next insertion will retry eviction.
|
||||
"""
|
||||
while len(_url_locks) > _URL_LOCKS_MAX:
|
||||
evicted = False
|
||||
for old_h in list(_url_locks.keys()):
|
||||
if old_h == skip:
|
||||
continue
|
||||
if _url_locks[old_h].locked():
|
||||
continue
|
||||
_url_locks.pop(old_h, None)
|
||||
evicted = True
|
||||
break
|
||||
if not evicted:
|
||||
return
|
||||
|
||||
|
||||
async def download_image(
|
||||
@ -293,7 +345,7 @@ async def download_image(
|
||||
filename = _url_hash(url) + _ext_from_url(url)
|
||||
local = dest_dir / filename
|
||||
|
||||
async with _url_locks[_url_hash(url)]:
|
||||
async with _get_url_lock(_url_hash(url)):
|
||||
# Check if a ugoira zip was already converted to gif
|
||||
if local.suffix.lower() == ".zip":
|
||||
gif_path = local.with_suffix(".gif")
|
||||
|
||||
@ -222,3 +222,71 @@ def test_is_valid_media_returns_true_on_oserror(tmp_path):
|
||||
OS issue persisted."""
|
||||
nonexistent = tmp_path / "definitely-not-here.jpg"
|
||||
assert _is_valid_media(nonexistent) is True
|
||||
|
||||
|
||||
# -- _url_locks LRU cap (audit finding #5) --
|
||||
|
||||
def test_url_locks_capped_at_max():
    """Lock-table growth must stop at _URL_LOCKS_MAX entries.

    Without the cap, a long browsing session — or an adversarial booru
    emitting cache-buster query strings — leaked one Lock per unique
    URL until OOM."""
    cache._url_locks.clear()
    try:
        i = 0
        while i < cache._URL_LOCKS_MAX + 500:
            cache._get_url_lock(f"hash{i}")
            i += 1
        assert len(cache._url_locks) <= cache._URL_LOCKS_MAX
    finally:
        cache._url_locks.clear()
|
||||
|
||||
|
||||
def test_url_locks_returns_same_lock_for_same_hash():
    """Repeated lookups of one hash must hand back one shared Lock
    object — coalescing concurrent downloads is the table's purpose."""
    cache._url_locks.clear()
    try:
        first = cache._get_url_lock("hashA")
        assert cache._get_url_lock("hashA") is first
    finally:
        cache._url_locks.clear()
|
||||
|
||||
|
||||
def test_url_locks_lru_keeps_recently_used():
    """Re-touching a hash moves it to the young end of the OrderedDict,
    so an eviction pass would pick an older entry first."""
    cache._url_locks.clear()
    try:
        cache._get_url_lock("oldest")
        cache._get_url_lock("middle")
        cache._get_url_lock("oldest")  # re-touch: "oldest" is now youngest
        # Expected order after the touch: middle first, oldest last.
        assert list(cache._url_locks) == ["middle", "oldest"]
    finally:
        cache._url_locks.clear()
|
||||
|
||||
|
||||
def test_url_locks_eviction_skips_held_locks():
    """An entry whose lock is currently held must survive eviction —
    popping it would break the holding coroutine's __aexit__. The
    eviction pass checks lock.locked() and moves on."""
    cache._url_locks.clear()
    try:
        held = cache._get_url_lock("held_hash")

        async def hold_and_fill():
            async with held:
                # Overfill the table while the lock is held so eviction
                # is forced to run and must step over the held entry.
                for i in range(cache._URL_LOCKS_MAX + 100):
                    cache._get_url_lock(f"new{i}")
                assert "held_hash" in cache._url_locks

        asyncio.run(hold_and_fill())
    finally:
        cache._url_locks.clear()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user