security: fix #10 — validate media magic in first 16 bytes of stream

The previous flow streamed the full body to disk and called
_is_valid_media after completion. A hostile server that omits
Content-Type (so the early text/html guard doesn't fire) could
burn up to MAX_DOWNLOAD_BYTES (500MB) of bandwidth and cache-dir
write/delete churn before the post-download check rejected.

Refactors _do_download to accumulate chunks into a small header
buffer until at least 16 bytes have arrived, then runs
_looks_like_media against the buffer before committing to writing
the full payload. The 16-byte minimum handles servers that send
tiny chunks (chunked encoding with 1-byte chunks, slow trickle,
TCP MSS fragmentation) without false-failing on the first chunk.

Extracts _looks_like_media(bytes) as a sibling to _is_valid_media
(path) sharing the same magic-byte recognition. _looks_like_media
fails closed on empty input — when called from the streaming
validator, an empty header means the server returned nothing
useful. _is_valid_media keeps its OSError-fallback open behavior
for the on-disk path so transient EBUSY doesn't trigger a delete
+ re-download loop.

Audit-Ref: SECURITY_AUDIT.md finding #10
Severity: Low
This commit is contained in:
pax 2026-04-11 16:24:59 -05:00
parent fef3c237f1
commit b65f8da837
2 changed files with 159 additions and 15 deletions

View File

@ -122,6 +122,33 @@ _IMAGE_MAGIC = {
b'PK\x03\x04': True, # ZIP (ugoira) b'PK\x03\x04': True, # ZIP (ugoira)
} }
# Header size used by both _looks_like_media (in-memory bytes) and the
# in-stream early validator in _do_download. 16 bytes covers JPEG (3),
# PNG (8), GIF (6), WebP (12), MP4/MOV (8), WebM/MKV (4), and ZIP (4)
# magics with comfortable margin.
_MEDIA_HEADER_MIN: int = 16
def _looks_like_media(header: bytes) -> bool:
"""Return True if the leading bytes match a known media magic.
Conservative on the empty case: an empty header is "unknown",
not "valid", because the streaming validator (audit #10) calls us
before any bytes have arrived means the server returned nothing
useful. The on-disk validator wraps this with an OSError fallback
that returns True instead see _is_valid_media.
"""
if not header:
return False
if header.startswith(b'<') or header.startswith(b'<!'):
return False
for magic in _IMAGE_MAGIC:
if header.startswith(magic):
return True
# Not a known magic and not HTML: treat as ok (some boorus serve
# exotic-but-legal containers we don't enumerate above).
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
def _is_valid_media(path: Path) -> bool: def _is_valid_media(path: Path) -> bool:
"""Check if a file looks like actual media, not an HTML error page. """Check if a file looks like actual media, not an HTML error page.
@ -133,18 +160,11 @@ def _is_valid_media(path: Path) -> bool:
""" """
try: try:
with open(path, "rb") as f: with open(path, "rb") as f:
header = f.read(16) header = f.read(_MEDIA_HEADER_MIN)
except OSError as e: except OSError as e:
log.warning("Cannot read %s for validation (%s); treating as valid", path, e) log.warning("Cannot read %s for validation (%s); treating as valid", path, e)
return True return True
if not header or header.startswith(b'<') or header.startswith(b'<!'): return _looks_like_media(header)
return False
# Check for known magic bytes
for magic in _IMAGE_MAGIC:
if header.startswith(magic):
return True
# If not a known type but not HTML, assume it's ok
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
def _ext_from_url(url: str) -> str: def _ext_from_url(url: str) -> str:
@ -429,7 +449,30 @@ async def _do_download(
f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})" f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})"
) )
if total >= STREAM_TO_DISK_THRESHOLD: # Audit #10: accumulate the leading bytes (≥16) before
# committing to writing the rest. A hostile server that omits
# Content-Type and ignores the HTML check could otherwise
# stream up to MAX_DOWNLOAD_BYTES of garbage to disk before
# the post-download _is_valid_media check rejects and deletes
# it. We accumulate across chunks because slow servers (or
# chunked encoding with tiny chunks) can deliver fewer than
# 16 bytes in the first chunk and validation would false-fail.
use_large = total >= STREAM_TO_DISK_THRESHOLD
chunk_iter = resp.aiter_bytes(64 * 1024 if use_large else 8192)
header_buf = bytearray()
async for chunk in chunk_iter:
header_buf.extend(chunk)
if len(header_buf) >= _MEDIA_HEADER_MIN:
break
if len(header_buf) > MAX_DOWNLOAD_BYTES:
raise ValueError(
f"Download exceeded cap mid-stream: {len(header_buf)} bytes"
)
if not _looks_like_media(bytes(header_buf)):
raise ValueError("Downloaded data is not valid media")
if use_large:
# Large download: stream to tempfile in the same dir, atomic replace. # Large download: stream to tempfile in the same dir, atomic replace.
local.parent.mkdir(parents=True, exist_ok=True) local.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_name = tempfile.mkstemp( fd, tmp_name = tempfile.mkstemp(
@ -437,9 +480,12 @@ async def _do_download(
) )
tmp_path = Path(tmp_name) tmp_path = Path(tmp_name)
try: try:
downloaded = 0 downloaded = len(header_buf)
with os.fdopen(fd, "wb") as out: with os.fdopen(fd, "wb") as out:
async for chunk in resp.aiter_bytes(64 * 1024): out.write(header_buf)
if progress_callback:
progress_callback(downloaded, total)
async for chunk in chunk_iter:
out.write(chunk) out.write(chunk)
downloaded += len(chunk) downloaded += len(chunk)
if downloaded > MAX_DOWNLOAD_BYTES: if downloaded > MAX_DOWNLOAD_BYTES:
@ -457,9 +503,11 @@ async def _do_download(
raise raise
else: else:
# Small/unknown size: buffer in memory, write whole. # Small/unknown size: buffer in memory, write whole.
chunks: list[bytes] = [] chunks: list[bytes] = [bytes(header_buf)]
downloaded = 0 downloaded = len(header_buf)
async for chunk in resp.aiter_bytes(8192): if progress_callback:
progress_callback(downloaded, total)
async for chunk in chunk_iter:
chunks.append(chunk) chunks.append(chunk)
downloaded += len(chunk) downloaded += len(chunk)
if downloaded > MAX_DOWNLOAD_BYTES: if downloaded > MAX_DOWNLOAD_BYTES:

View File

@ -212,6 +212,102 @@ def test_download_cap_running_total_aborts(tmp_path, monkeypatch):
assert not local.exists() assert not local.exists()
# -- _looks_like_media (audit finding #10) --
def test_looks_like_media_jpeg_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # SOI marker + JFIF APP0 segment.
    jpeg_header = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01"
    assert _looks_like_media(jpeg_header) is True
def test_looks_like_media_png_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # Full 8-byte PNG signature followed by the IHDR chunk header.
    png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
    assert _looks_like_media(png_header) is True
def test_looks_like_media_webm_magic_recognised():
    from booru_viewer.core.cache import _looks_like_media
    # EBML header (Matroska/WebM): 1A 45 DF A3, padded past the minimum.
    ebml_header = b"\x1a\x45\xdf\xa3" + bytes(20)
    assert _looks_like_media(ebml_header) is True
def test_looks_like_media_html_rejected():
    from booru_viewer.core.cache import _looks_like_media
    html_samples = (b"<!doctype html><html><body>", b"<html><head>")
    for sample in html_samples:
        assert _looks_like_media(sample) is False
def test_looks_like_media_empty_rejected():
    """Empty input fails closed: a server that returned no bytes gave us
    nothing useful (unlike the on-disk validator's open-on-OSError
    fallback)."""
    from booru_viewer.core.cache import _looks_like_media
    assert _looks_like_media(b"") is False
def test_looks_like_media_unknown_magic_accepted():
    """Bytes that are neither HTML nor a known magic pass the check —
    some boorus serve exotic-but-legal containers we don't enumerate."""
    from booru_viewer.core.cache import _looks_like_media
    unknown_blob = b"random non-html data "
    assert _looks_like_media(unknown_blob) is True
# -- _do_download early header validation (audit finding #10) --
def test_do_download_early_rejects_html_payload(tmp_path):
    """A hostile server that returns HTML in the body while omitting
    Content-Type (so the early text/html guard never fires) must be
    caught by the magic-byte check before any bytes land on disk.

    Audit finding #10: previously the entire download completed before
    _is_valid_media rejected it, wasting bandwidth."""
    response = _FakeResponse(
        headers={"content-length": "0"},  # deliberately no Content-Type
        chunks=[b"<!doctype html><html><body>500</body></html>"],
    )
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    with pytest.raises(ValueError, match="not valid media"):
        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert not target.exists()
def test_do_download_early_rejects_html_across_tiny_chunks(tmp_path):
    """A server trickling a couple of bytes per chunk must not slip past
    the magic-byte check: the accumulator combines chunks until the
    16-byte minimum is reached before validating."""
    tiny_chunks = [b"<!", b"do", b"ct", b"yp", b"e ", b"ht", b"ml", b">", b"x" * 100]
    response = _FakeResponse(headers={"content-length": "0"}, chunks=tiny_chunks)
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    with pytest.raises(ValueError, match="not valid media"):
        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert not target.exists()
def test_do_download_writes_valid_jpeg_after_early_validation(tmp_path):
    """A JPEG-like header passes the early check and the rest of the
    stream is written through. The header bytes themselves must appear
    in the final file — they cannot be silently dropped."""
    body = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01" + b"PAYLOAD" + b"\xff\xd9"
    response = _FakeResponse(
        headers={"content-length": str(len(body)), "content-type": "image/jpeg"},
        chunks=[body[:8], body[8:]],  # boundary falls mid-magic on purpose
    )
    client = _FakeClient(response)
    target = tmp_path / "out.jpg"
    asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, target, None))
    assert target.exists()
    assert target.read_bytes() == body
# -- _is_valid_media OSError fallback -- # -- _is_valid_media OSError fallback --
def test_is_valid_media_returns_true_on_oserror(tmp_path): def test_is_valid_media_returns_true_on_oserror(tmp_path):