security: fix #5 — LRU cap on _url_locks to prevent memory leak
Replaces the unbounded defaultdict(asyncio.Lock) with an OrderedDict guarded by _get_url_lock() and _evict_url_locks(). The cap is 4096 entries; LRU semantics keep the hot working set alive and oldest-unlocked-first eviction trims back toward the cap on each new insertion. Eviction skips locks that are currently held — popping a lock that a coroutine is mid-`async with` on would break its __aexit__. The inner loop's evicted-flag handles the edge case where every remaining entry is either the freshly inserted hash or held; in that state the cap is briefly exceeded and the next insertion retries, instead of looping forever. Audit-Ref: SECURITY_AUDIT.md finding #5 Severity: Medium
This commit is contained in:
parent
a6a73fed61
commit
5d348fa8be
@ -9,7 +9,7 @@ import os
|
|||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import zipfile
|
import zipfile
|
||||||
from collections import OrderedDict, defaultdict
|
from collections import OrderedDict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@ -276,7 +276,59 @@ def _referer_for(parsed) -> str:
|
|||||||
# does the actual download; the other waits and reads the cached file.
|
# does the actual download; the other waits and reads the cached file.
|
||||||
# Loop-bound, but the existing module is already loop-bound, so this
|
# Loop-bound, but the existing module is already loop-bound, so this
|
||||||
# doesn't make anything worse and is fixed cleanly in PR2.
|
# doesn't make anything worse and is fixed cleanly in PR2.
|
||||||
_url_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
|
#
|
||||||
|
# Capped at _URL_LOCKS_MAX entries (audit finding #5). The previous
|
||||||
|
# defaultdict grew unbounded over a long browsing session, and an
|
||||||
|
# adversarial booru returning cache-buster query strings could turn
|
||||||
|
# the leak into an OOM DoS.
|
||||||
|
_URL_LOCKS_MAX = 4096
|
||||||
|
_url_locks: "OrderedDict[str, asyncio.Lock]" = OrderedDict()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_url_lock(h: str) -> asyncio.Lock:
    """Fetch (or lazily create) the coalescing lock for URL hash *h*.

    Every lookup refreshes the entry's LRU position so hashes that are
    hit often stay resident.  A brand-new hash is inserted first and
    then _evict_url_locks() trims the table back toward the cap.
    """
    existing = _url_locks.get(h)
    if existing is not None:
        # Known hash: refresh its recency and hand back the same Lock.
        _url_locks.move_to_end(h)
        return existing
    fresh = asyncio.Lock()
    _url_locks[h] = fresh
    _evict_url_locks(skip=h)
    return fresh
|
||||||
|
|
||||||
|
|
||||||
|
def _evict_url_locks(skip: str) -> None:
    """Trim _url_locks back toward _URL_LOCKS_MAX, oldest first.

    Entries that are never evicted:
      - the just-inserted hash *skip* (it's the youngest — evicting it
        immediately would be self-defeating), and
      - any entry whose lock is currently held (we can't drop a lock
        that a coroutine is mid-``async with`` on without that
        coroutine blowing up on exit).

    If every candidate is *skip* or held, the cap is temporarily
    exceeded; the next insertion will retry eviction.

    This function is fully synchronous (no awaits), so no lock can
    change held/unheld state mid-scan and no entry can be inserted
    while we iterate.  A single oldest-first pass over one snapshot of
    the keys therefore evicts exactly the entries the previous
    restart-from-oldest while-loop did, but in O(n) instead of O(n^2).
    """
    if len(_url_locks) <= _URL_LOCKS_MAX:
        return
    # Snapshot the keys so we can pop from the dict while scanning.
    for old_h in list(_url_locks.keys()):
        if old_h == skip or _url_locks[old_h].locked():
            # Protected entry — keep it and move on to a younger one.
            continue
        _url_locks.pop(old_h, None)
        if len(_url_locks) <= _URL_LOCKS_MAX:
            return
|
||||||
|
|
||||||
|
|
||||||
async def download_image(
|
async def download_image(
|
||||||
@ -293,7 +345,7 @@ async def download_image(
|
|||||||
filename = _url_hash(url) + _ext_from_url(url)
|
filename = _url_hash(url) + _ext_from_url(url)
|
||||||
local = dest_dir / filename
|
local = dest_dir / filename
|
||||||
|
|
||||||
async with _url_locks[_url_hash(url)]:
|
async with _get_url_lock(_url_hash(url)):
|
||||||
# Check if a ugoira zip was already converted to gif
|
# Check if a ugoira zip was already converted to gif
|
||||||
if local.suffix.lower() == ".zip":
|
if local.suffix.lower() == ".zip":
|
||||||
gif_path = local.with_suffix(".gif")
|
gif_path = local.with_suffix(".gif")
|
||||||
|
|||||||
@ -222,3 +222,71 @@ def test_is_valid_media_returns_true_on_oserror(tmp_path):
|
|||||||
OS issue persisted."""
|
OS issue persisted."""
|
||||||
nonexistent = tmp_path / "definitely-not-here.jpg"
|
nonexistent = tmp_path / "definitely-not-here.jpg"
|
||||||
assert _is_valid_media(nonexistent) is True
|
assert _is_valid_media(nonexistent) is True
|
||||||
|
|
||||||
|
|
||||||
|
# -- _url_locks LRU cap (audit finding #5) --
|
||||||
|
|
||||||
|
def test_url_locks_capped_at_max():
    """Inserting far more unique hashes than _URL_LOCKS_MAX must not let
    the coalesce table grow past the cap.  The unbounded version leaked
    one Lock per unique URL — a long session or an adversarial booru
    serving cache-buster query strings would march it toward OOM."""
    cache._url_locks.clear()
    try:
        total = cache._URL_LOCKS_MAX + 500
        for i in range(total):
            cache._get_url_lock(f"hash{i}")
        assert len(cache._url_locks) <= cache._URL_LOCKS_MAX
    finally:
        cache._url_locks.clear()
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_locks_returns_same_lock_for_same_hash():
    """Repeated lookups of one hash must yield one shared Lock object;
    handing out distinct Locks would defeat download coalescing, which
    is the table's entire reason to exist."""
    cache._url_locks.clear()
    try:
        first = cache._get_url_lock("hashA")
        second = cache._get_url_lock("hashA")
        assert first is second
    finally:
        cache._url_locks.clear()
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_locks_lru_keeps_recently_used():
    """Re-touching a hash must move it to the young end of the table,
    so eviction would pick an older entry instead of it."""
    cache._url_locks.clear()
    try:
        cache._get_url_lock("oldest")
        cache._get_url_lock("middle")
        cache._get_url_lock("oldest")  # touch — becomes youngest
        # After the touch the order is: middle (old), oldest (young).
        assert list(cache._url_locks) == ["middle", "oldest"]
    finally:
        cache._url_locks.clear()
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_locks_eviction_skips_held_locks():
    """An entry whose lock is held (some coroutine is inside its
    ``async with``) must survive eviction — dropping it would break
    that coroutine's ``__aexit__``.  The eviction loop detects this via
    ``lock.locked()`` and skips the entry."""
    cache._url_locks.clear()
    try:
        # Seed one entry whose lock we will hold across the overflow.
        held = cache._get_url_lock("held_hash")

        async def overflow_while_held():
            async with held:
                # Push the table well past the cap while the lock is
                # held, forcing eviction to run repeatedly.
                for i in range(cache._URL_LOCKS_MAX + 100):
                    cache._get_url_lock(f"new{i}")
                # The held entry must not have been evicted.
                assert "held_hash" in cache._url_locks

        asyncio.run(overflow_while_held())
    finally:
        cache._url_locks.clear()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user