security: fix #10 — validate media magic in first 16 bytes of stream

The previous flow streamed the full body to disk and called
_is_valid_media after completion. A hostile server that omits
Content-Type (so the early text/html guard doesn't fire) could
burn up to MAX_DOWNLOAD_BYTES (500MB) of bandwidth and cache-dir
write/delete churn before the post-download check rejected.

Refactors _do_download to accumulate chunks into a small header
buffer until at least 16 bytes have arrived, then runs
_looks_like_media against the buffer before committing to writing
the full payload. The 16-byte minimum handles servers that send
tiny chunks (chunked encoding with 1-byte chunks, slow trickle,
TCP MSS fragmentation) without false-failing on the first chunk.

Extracts _looks_like_media(bytes) as a sibling to _is_valid_media
(path) sharing the same magic-byte recognition. _looks_like_media
fails closed on empty input — when called from the streaming
validator, an empty header means the server returned nothing
useful. _is_valid_media keeps its OSError-fallback open behavior
for the on-disk path so transient EBUSY doesn't trigger a delete
+ re-download loop.

Audit-Ref: SECURITY_AUDIT.md finding #10
Severity: Low
This commit is contained in:
pax 2026-04-11 16:24:59 -05:00
parent fef3c237f1
commit b65f8da837
2 changed files with 159 additions and 15 deletions

View File

@ -122,6 +122,33 @@ _IMAGE_MAGIC = {
b'PK\x03\x04': True, # ZIP (ugoira) b'PK\x03\x04': True, # ZIP (ugoira)
} }
# Header size used by both _looks_like_media (in-memory bytes) and the
# in-stream early validator in _do_download. 16 bytes covers JPEG (3),
# PNG (8), GIF (6), WebP (12), MP4/MOV (8), WebM/MKV (4), and ZIP (4)
# magics with comfortable margin.
_MEDIA_HEADER_MIN: int = 16
def _looks_like_media(header: bytes) -> bool:
"""Return True if the leading bytes match a known media magic.
Conservative on the empty case: an empty header is "unknown",
not "valid", because the streaming validator (audit #10) calls us
before any bytes have arrived means the server returned nothing
useful. The on-disk validator wraps this with an OSError fallback
that returns True instead see _is_valid_media.
"""
if not header:
return False
if header.startswith(b'<') or header.startswith(b'<!'):
return False
for magic in _IMAGE_MAGIC:
if header.startswith(magic):
return True
# Not a known magic and not HTML: treat as ok (some boorus serve
# exotic-but-legal containers we don't enumerate above).
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
def _is_valid_media(path: Path) -> bool: def _is_valid_media(path: Path) -> bool:
"""Check if a file looks like actual media, not an HTML error page. """Check if a file looks like actual media, not an HTML error page.
@ -133,18 +160,11 @@ def _is_valid_media(path: Path) -> bool:
""" """
try: try:
with open(path, "rb") as f: with open(path, "rb") as f:
header = f.read(16) header = f.read(_MEDIA_HEADER_MIN)
except OSError as e: except OSError as e:
log.warning("Cannot read %s for validation (%s); treating as valid", path, e) log.warning("Cannot read %s for validation (%s); treating as valid", path, e)
return True return True
if not header or header.startswith(b'<') or header.startswith(b'<!'): return _looks_like_media(header)
return False
# Check for known magic bytes
for magic in _IMAGE_MAGIC:
if header.startswith(magic):
return True
# If not a known type but not HTML, assume it's ok
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
def _ext_from_url(url: str) -> str: def _ext_from_url(url: str) -> str:
@ -429,7 +449,30 @@ async def _do_download(
f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})" f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})"
) )
if total >= STREAM_TO_DISK_THRESHOLD: # Audit #10: accumulate the leading bytes (≥16) before
# committing to writing the rest. A hostile server that omits
# Content-Type and ignores the HTML check could otherwise
# stream up to MAX_DOWNLOAD_BYTES of garbage to disk before
# the post-download _is_valid_media check rejects and deletes
# it. We accumulate across chunks because slow servers (or
# chunked encoding with tiny chunks) can deliver fewer than
# 16 bytes in the first chunk and validation would false-fail.
use_large = total >= STREAM_TO_DISK_THRESHOLD
chunk_iter = resp.aiter_bytes(64 * 1024 if use_large else 8192)
header_buf = bytearray()
async for chunk in chunk_iter:
header_buf.extend(chunk)
if len(header_buf) >= _MEDIA_HEADER_MIN:
break
if len(header_buf) > MAX_DOWNLOAD_BYTES:
raise ValueError(
f"Download exceeded cap mid-stream: {len(header_buf)} bytes"
)
if not _looks_like_media(bytes(header_buf)):
raise ValueError("Downloaded data is not valid media")
if use_large:
# Large download: stream to tempfile in the same dir, atomic replace. # Large download: stream to tempfile in the same dir, atomic replace.
local.parent.mkdir(parents=True, exist_ok=True) local.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_name = tempfile.mkstemp( fd, tmp_name = tempfile.mkstemp(
@ -437,9 +480,12 @@ async def _do_download(
) )
tmp_path = Path(tmp_name) tmp_path = Path(tmp_name)
try: try:
downloaded = 0 downloaded = len(header_buf)
with os.fdopen(fd, "wb") as out: with os.fdopen(fd, "wb") as out:
async for chunk in resp.aiter_bytes(64 * 1024): out.write(header_buf)
if progress_callback:
progress_callback(downloaded, total)
async for chunk in chunk_iter:
out.write(chunk) out.write(chunk)
downloaded += len(chunk) downloaded += len(chunk)
if downloaded > MAX_DOWNLOAD_BYTES: if downloaded > MAX_DOWNLOAD_BYTES:
@ -457,9 +503,11 @@ async def _do_download(
raise raise
else: else:
# Small/unknown size: buffer in memory, write whole. # Small/unknown size: buffer in memory, write whole.
chunks: list[bytes] = [] chunks: list[bytes] = [bytes(header_buf)]
downloaded = 0 downloaded = len(header_buf)
async for chunk in resp.aiter_bytes(8192): if progress_callback:
progress_callback(downloaded, total)
async for chunk in chunk_iter:
chunks.append(chunk) chunks.append(chunk)
downloaded += len(chunk) downloaded += len(chunk)
if downloaded > MAX_DOWNLOAD_BYTES: if downloaded > MAX_DOWNLOAD_BYTES:

View File

@ -212,6 +212,102 @@ def test_download_cap_running_total_aborts(tmp_path, monkeypatch):
assert not local.exists() assert not local.exists()
# -- _looks_like_media (audit finding #10) --
def test_looks_like_media_jpeg_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # SOI marker + JFIF APP0 segment.
    jpeg_header = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01"
    assert _looks_like_media(jpeg_header) is True
def test_looks_like_media_png_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # Full 8-byte PNG signature followed by the IHDR chunk header.
    png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
    assert _looks_like_media(png_header) is True
def test_looks_like_media_webm_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # EBML header (Matroska/WebM): 1A 45 DF A3, padded past the minimum.
    ebml_header = b"\x1a\x45\xdf\xa3" + bytes(20)
    assert _looks_like_media(ebml_header) is True
def test_looks_like_media_html_rejected():
    from booru_viewer.core.cache import _looks_like_media
    html_samples = (b"<!doctype html><html><body>", b"<html><head>")
    for sample in html_samples:
        assert _looks_like_media(sample) is False
def test_looks_like_media_empty_rejected():
    """Empty input fails closed: a server that returned no bytes gave us
    nothing useful (unlike the on-disk validator's open-on-OSError
    fallback)."""
    from booru_viewer.core.cache import _looks_like_media
    assert _looks_like_media(b"") is False
def test_looks_like_media_unknown_magic_accepted():
    """Bytes that are neither HTML nor a known magic pass the check —
    some boorus serve exotic-but-legal containers we don't enumerate."""
    from booru_viewer.core.cache import _looks_like_media
    unknown_blob = b"random non-html data "
    assert _looks_like_media(unknown_blob) is True
# -- _do_download early header validation (audit finding #10) --
def test_do_download_early_rejects_html_payload(tmp_path):
    """A hostile server that returns HTML in the body while omitting
    Content-Type (so the early text/html guard never fires) must be
    caught by the magic-byte check before any bytes land on disk.

    Audit finding #10: previously the entire download completed before
    _is_valid_media rejected it, wasting bandwidth."""
    response = _FakeResponse(
        headers={"content-length": "0"},  # deliberately no Content-Type
        chunks=[b"<!doctype html><html><body>500</body></html>"],
    )
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    with pytest.raises(ValueError, match="not valid media"):
        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert not target.exists()
def test_do_download_early_rejects_html_across_tiny_chunks(tmp_path):
    """A server trickling a couple of bytes per chunk must not slip past
    the magic-byte check: the accumulator combines chunks until the
    16-byte minimum is reached before validating."""
    tiny_chunks = [b"<!", b"do", b"ct", b"yp", b"e ", b"ht", b"ml", b">", b"x" * 100]
    response = _FakeResponse(headers={"content-length": "0"}, chunks=tiny_chunks)
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    with pytest.raises(ValueError, match="not valid media"):
        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert not target.exists()
def test_do_download_writes_valid_jpeg_after_early_validation(tmp_path):
    """A JPEG-like header passes the early check and the rest of the
    stream is written through. The header bytes themselves must appear
    in the final file — they cannot be silently dropped."""
    body = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01" + b"PAYLOAD" + b"\xff\xd9"
    response = _FakeResponse(
        headers={"content-length": str(len(body)), "content-type": "image/jpeg"},
        chunks=[body[:8], body[8:]],  # boundary falls mid-magic on purpose
    )
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert target.exists()
    assert target.read_bytes() == body
# -- _is_valid_media OSError fallback -- # -- _is_valid_media OSError fallback --
def test_is_valid_media_returns_true_on_oserror(tmp_path): def test_is_valid_media_returns_true_on_oserror(tmp_path):