diff --git a/booru_viewer/core/cache.py b/booru_viewer/core/cache.py
index 9e82ea4..470ede6 100644
--- a/booru_viewer/core/cache.py
+++ b/booru_viewer/core/cache.py
@@ -122,6 +122,33 @@ _IMAGE_MAGIC = {
     b'PK\x03\x04': True,  # ZIP (ugoira)
 }
 
 
+# Header size used by both _looks_like_media (in-memory bytes) and the
+# in-stream early validator in _do_download. 16 bytes covers JPEG (3),
+# PNG (8), GIF (6), WebP (12), MP4/MOV (8), WebM/MKV (4), and ZIP (4)
+# magics with comfortable margin.
+_MEDIA_HEADER_MIN = 16
+
+
+def _looks_like_media(header: bytes) -> bool:
+    """Return True if the leading bytes match a known media magic.
+
+    Conservative on the empty case: an empty header is "unknown",
+    not "valid", because the streaming validator (audit #10) calls us
+    before any bytes have arrived means the server returned nothing
+    useful. The on-disk validator wraps this with an OSError fallback
+    that returns True instead — see _is_valid_media.
+    """
+    if not header:
+        return False
+    if header.startswith(b'<') or header.startswith(b'<!DOCTYPE'):
+        return False
+    for magic in _IMAGE_MAGIC:
+        if header.startswith(magic):
+            return True
+    # Unknown magic but not HTML: conservative-OK (exotic containers).
+    return True
+
+
 def _is_valid_media(path: Path) -> bool:
     """Check if a file looks like actual media, not an HTML error page.
@@ -133,18 +160,11 @@ def _is_valid_media(path: Path) -> bool:
     """
     try:
         with open(path, "rb") as f:
-            header = f.read(16)
+            header = f.read(_MEDIA_HEADER_MIN)
     except OSError as e:
         log.warning("Cannot read %s for validation (%s); treating as valid", path, e)
         return True
-    if not header or header.startswith(b'<') or header.startswith(b'<!DOCTYPE'):
-        log.warning("Cached file %s looks like HTML, not media", path)
-        return False
-    for magic in _IMAGE_MAGIC:
-        if header.startswith(magic):
-            return True
-    log.warning("Unknown magic in %s; treating as valid", path)
-    return True
+    return _looks_like_media(header)
 
 
 def cache_path(url: str) -> str:
@@ -429,7 +449,30 @@
             f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})"
         )
 
-    if total >= STREAM_TO_DISK_THRESHOLD:
+    # Audit #10: accumulate the leading bytes (≥16) before
+    # committing to writing the rest. A hostile server that omits
+    # Content-Type and ignores the HTML check could otherwise
+    # stream up to MAX_DOWNLOAD_BYTES of garbage to disk before
+    # the post-download _is_valid_media check rejects and deletes
+    # it. We accumulate across chunks because slow servers (or
+    # chunked encoding with tiny chunks) can deliver fewer than
+    # 16 bytes in the first chunk and validation would false-fail.
+    use_large = total >= STREAM_TO_DISK_THRESHOLD
+    chunk_iter = resp.aiter_bytes(64 * 1024 if use_large else 8192)
+
+    header_buf = bytearray()
+    async for chunk in chunk_iter:
+        header_buf.extend(chunk)
+        if len(header_buf) >= _MEDIA_HEADER_MIN:
+            break
+    if len(header_buf) > MAX_DOWNLOAD_BYTES:
+        raise ValueError(
+            f"Download exceeded cap mid-stream: {len(header_buf)} bytes"
+        )
+    if not _looks_like_media(bytes(header_buf)):
+        raise ValueError("Downloaded data is not valid media")
+
+    if use_large:
         # Large download: stream to tempfile in the same dir, atomic replace.
         local.parent.mkdir(parents=True, exist_ok=True)
         fd, tmp_name = tempfile.mkstemp(
@@ -437,9 +480,12 @@
         )
         tmp_path = Path(tmp_name)
         try:
-            downloaded = 0
+            downloaded = len(header_buf)
             with os.fdopen(fd, "wb") as out:
-                async for chunk in resp.aiter_bytes(64 * 1024):
+                out.write(header_buf)
+                if progress_callback:
+                    progress_callback(downloaded, total)
+                async for chunk in chunk_iter:
                     out.write(chunk)
                     downloaded += len(chunk)
                     if downloaded > MAX_DOWNLOAD_BYTES:
@@ -457,9 +503,11 @@
             raise
     else:
         # Small/unknown size: buffer in memory, write whole.
-        chunks: list[bytes] = []
-        downloaded = 0
-        async for chunk in resp.aiter_bytes(8192):
+        chunks: list[bytes] = [bytes(header_buf)]
+        downloaded = len(header_buf)
+        if progress_callback:
+            progress_callback(downloaded, total)
+        async for chunk in chunk_iter:
             chunks.append(chunk)
             downloaded += len(chunk)
             if downloaded > MAX_DOWNLOAD_BYTES:
diff --git a/tests/core/test_cache.py b/tests/core/test_cache.py
index 444e924..5c4a781 100644
--- a/tests/core/test_cache.py
+++ b/tests/core/test_cache.py
@@ -212,6 +212,102 @@ def test_download_cap_running_total_aborts(tmp_path, monkeypatch):
     assert not local.exists()
 
+# -- _looks_like_media (audit finding #10) --
+
+
+def test_looks_like_media_jpeg_magic_recognised():
+    from booru_viewer.core.cache import _looks_like_media
+    assert _looks_like_media(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01") is True
+
+
+def test_looks_like_media_png_magic_recognised():
+    from booru_viewer.core.cache import _looks_like_media
+    assert _looks_like_media(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") is True
+
+
+def test_looks_like_media_webm_magic_recognised():
+    from booru_viewer.core.cache import _looks_like_media
+    # EBML header (Matroska/WebM): 1A 45 DF A3
+    assert _looks_like_media(b"\x1aE\xdf\xa3" + b"\x00" * 20) is True
+
+
+def test_looks_like_media_html_rejected():
+    from booru_viewer.core.cache import _looks_like_media
+    assert _looks_like_media(b"<html><head>") is False
+    assert _looks_like_media(b"<!DOCTYPE html><html>") is False
+
+
+def test_looks_like_media_empty_rejected():
+    """An empty buffer means the server returned nothing useful — fail
+    closed (rather than the on-disk validator's open-on-error fallback)."""
+    from booru_viewer.core.cache import _looks_like_media
+    assert _looks_like_media(b"") is False
+
+
+def test_looks_like_media_unknown_magic_accepted():
+    """Non-HTML, non-magic bytes are conservative-OK — some boorus
+    serve exotic-but-legal containers we don't enumerate."""
+    from booru_viewer.core.cache import _looks_like_media
+    assert _looks_like_media(b"random non-html data ") is True
+
+
+# -- _do_download early header validation (audit finding #10) --
+
+
+def test_do_download_early_rejects_html_payload(tmp_path):
+    """A hostile server that returns HTML in the body (omitting
+    Content-Type so the early text/html guard doesn't fire) must be
+    caught by the magic-byte check before any bytes land on disk.
+    Audit finding #10: this used to wait for the full download to
+    complete before _is_valid_media rejected, wasting bandwidth."""
+    response = _FakeResponse(
+        headers={"content-length": "0"},  # no Content-Type, no length
+        chunks=[b"<html><body>500</body></html>"],
+    )
+    client = _FakeClient(response)
+    local = tmp_path / "out.jpg"
+
+    with pytest.raises(ValueError, match="not valid media"):
+        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
+
+    assert not local.exists()
+
+
+def test_do_download_early_rejects_html_across_tiny_chunks(tmp_path):
+    """The accumulator must combine chunks smaller than the 16-byte
+    minimum so a server delivering one byte at a time can't slip
+    past the magic-byte check."""
+    response = _FakeResponse(
+        headers={"content-length": "0"},
+        chunks=[b"<html>", b"x" * 100],
+    )
+    client = _FakeClient(response)
+    local = tmp_path / "out.jpg"
+
+    with pytest.raises(ValueError, match="not valid media"):
+        asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
+
+    assert not local.exists()
+
+
+def test_do_download_writes_valid_jpeg_after_early_validation(tmp_path):
+    """A real JPEG-like header passes the early check and the rest
+    of the stream is written through to disk. Header bytes must
+    appear in the final file (not be silently dropped)."""
+    body = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01" + b"PAYLOAD" + b"\xff\xd9"
+    response = _FakeResponse(
+        headers={"content-length": str(len(body)), "content-type": "image/jpeg"},
+        chunks=[body[:8], body[8:]],  # split mid-magic
+    )
+    client = _FakeClient(response)
+    local = tmp_path / "out.jpg"
+
+    asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
+
+    assert local.exists()
+    assert local.read_bytes() == body
+
+
 # -- _is_valid_media OSError fallback --
 
 
 def test_is_valid_media_returns_true_on_oserror(tmp_path):