security: fix #10 — validate media magic in first 16 bytes of stream
The previous flow streamed the full body to disk and called _is_valid_media after completion. A hostile server that omits Content-Type (so the early text/html guard doesn't fire) could burn up to MAX_DOWNLOAD_BYTES (500MB) of bandwidth and cache-dir write/delete churn before the post-download check rejected. Refactors _do_download to accumulate chunks into a small header buffer until at least 16 bytes have arrived, then runs _looks_like_media against the buffer before committing to writing the full payload. The 16-byte minimum handles servers that send tiny chunks (chunked encoding with 1-byte chunks, slow trickle, TCP MSS fragmentation) without false-failing on the first chunk. Extracts _looks_like_media(bytes) as a sibling to _is_valid_media (path) sharing the same magic-byte recognition. _looks_like_media fails closed on empty input — when called from the streaming validator, an empty header means the server returned nothing useful. _is_valid_media keeps its OSError-fallback open behavior for the on-disk path so transient EBUSY doesn't trigger a delete + re-download loop. Audit-Ref: SECURITY_AUDIT.md finding #10 Severity: Low
This commit is contained in:
parent
fef3c237f1
commit
b65f8da837
@ -122,6 +122,33 @@ _IMAGE_MAGIC = {
|
|||||||
b'PK\x03\x04': True, # ZIP (ugoira)
|
b'PK\x03\x04': True, # ZIP (ugoira)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Header size used by both _looks_like_media (in-memory bytes) and the
|
||||||
|
# in-stream early validator in _do_download. 16 bytes covers JPEG (3),
|
||||||
|
# PNG (8), GIF (6), WebP (12), MP4/MOV (8), WebM/MKV (4), and ZIP (4)
|
||||||
|
# magics with comfortable margin.
|
||||||
|
_MEDIA_HEADER_MIN = 16
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_media(header: bytes) -> bool:
|
||||||
|
"""Return True if the leading bytes match a known media magic.
|
||||||
|
|
||||||
|
Conservative on the empty case: an empty header is "unknown",
|
||||||
|
not "valid", because the streaming validator (audit #10) calls us
|
||||||
|
before any bytes have arrived means the server returned nothing
|
||||||
|
useful. The on-disk validator wraps this with an OSError fallback
|
||||||
|
that returns True instead — see _is_valid_media.
|
||||||
|
"""
|
||||||
|
if not header:
|
||||||
|
return False
|
||||||
|
if header.startswith(b'<') or header.startswith(b'<!'):
|
||||||
|
return False
|
||||||
|
for magic in _IMAGE_MAGIC:
|
||||||
|
if header.startswith(magic):
|
||||||
|
return True
|
||||||
|
# Not a known magic and not HTML: treat as ok (some boorus serve
|
||||||
|
# exotic-but-legal containers we don't enumerate above).
|
||||||
|
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
|
||||||
|
|
||||||
|
|
||||||
def _is_valid_media(path: Path) -> bool:
|
def _is_valid_media(path: Path) -> bool:
|
||||||
"""Check if a file looks like actual media, not an HTML error page.
|
"""Check if a file looks like actual media, not an HTML error page.
|
||||||
@ -133,18 +160,11 @@ def _is_valid_media(path: Path) -> bool:
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
header = f.read(16)
|
header = f.read(_MEDIA_HEADER_MIN)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
log.warning("Cannot read %s for validation (%s); treating as valid", path, e)
|
log.warning("Cannot read %s for validation (%s); treating as valid", path, e)
|
||||||
return True
|
return True
|
||||||
if not header or header.startswith(b'<') or header.startswith(b'<!'):
|
return _looks_like_media(header)
|
||||||
return False
|
|
||||||
# Check for known magic bytes
|
|
||||||
for magic in _IMAGE_MAGIC:
|
|
||||||
if header.startswith(magic):
|
|
||||||
return True
|
|
||||||
# If not a known type but not HTML, assume it's ok
|
|
||||||
return b'<html' not in header.lower() and b'<!doctype' not in header.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def _ext_from_url(url: str) -> str:
|
def _ext_from_url(url: str) -> str:
|
||||||
@ -429,7 +449,30 @@ async def _do_download(
|
|||||||
f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})"
|
f"Download too large: {total} bytes (cap {MAX_DOWNLOAD_BYTES})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if total >= STREAM_TO_DISK_THRESHOLD:
|
# Audit #10: accumulate the leading bytes (≥16) before
|
||||||
|
# committing to writing the rest. A hostile server that omits
|
||||||
|
# Content-Type and ignores the HTML check could otherwise
|
||||||
|
# stream up to MAX_DOWNLOAD_BYTES of garbage to disk before
|
||||||
|
# the post-download _is_valid_media check rejects and deletes
|
||||||
|
# it. We accumulate across chunks because slow servers (or
|
||||||
|
# chunked encoding with tiny chunks) can deliver fewer than
|
||||||
|
# 16 bytes in the first chunk and validation would false-fail.
|
||||||
|
use_large = total >= STREAM_TO_DISK_THRESHOLD
|
||||||
|
chunk_iter = resp.aiter_bytes(64 * 1024 if use_large else 8192)
|
||||||
|
|
||||||
|
header_buf = bytearray()
|
||||||
|
async for chunk in chunk_iter:
|
||||||
|
header_buf.extend(chunk)
|
||||||
|
if len(header_buf) >= _MEDIA_HEADER_MIN:
|
||||||
|
break
|
||||||
|
if len(header_buf) > MAX_DOWNLOAD_BYTES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Download exceeded cap mid-stream: {len(header_buf)} bytes"
|
||||||
|
)
|
||||||
|
if not _looks_like_media(bytes(header_buf)):
|
||||||
|
raise ValueError("Downloaded data is not valid media")
|
||||||
|
|
||||||
|
if use_large:
|
||||||
# Large download: stream to tempfile in the same dir, atomic replace.
|
# Large download: stream to tempfile in the same dir, atomic replace.
|
||||||
local.parent.mkdir(parents=True, exist_ok=True)
|
local.parent.mkdir(parents=True, exist_ok=True)
|
||||||
fd, tmp_name = tempfile.mkstemp(
|
fd, tmp_name = tempfile.mkstemp(
|
||||||
@ -437,9 +480,12 @@ async def _do_download(
|
|||||||
)
|
)
|
||||||
tmp_path = Path(tmp_name)
|
tmp_path = Path(tmp_name)
|
||||||
try:
|
try:
|
||||||
downloaded = 0
|
downloaded = len(header_buf)
|
||||||
with os.fdopen(fd, "wb") as out:
|
with os.fdopen(fd, "wb") as out:
|
||||||
async for chunk in resp.aiter_bytes(64 * 1024):
|
out.write(header_buf)
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(downloaded, total)
|
||||||
|
async for chunk in chunk_iter:
|
||||||
out.write(chunk)
|
out.write(chunk)
|
||||||
downloaded += len(chunk)
|
downloaded += len(chunk)
|
||||||
if downloaded > MAX_DOWNLOAD_BYTES:
|
if downloaded > MAX_DOWNLOAD_BYTES:
|
||||||
@ -457,9 +503,11 @@ async def _do_download(
|
|||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
# Small/unknown size: buffer in memory, write whole.
|
# Small/unknown size: buffer in memory, write whole.
|
||||||
chunks: list[bytes] = []
|
chunks: list[bytes] = [bytes(header_buf)]
|
||||||
downloaded = 0
|
downloaded = len(header_buf)
|
||||||
async for chunk in resp.aiter_bytes(8192):
|
if progress_callback:
|
||||||
|
progress_callback(downloaded, total)
|
||||||
|
async for chunk in chunk_iter:
|
||||||
chunks.append(chunk)
|
chunks.append(chunk)
|
||||||
downloaded += len(chunk)
|
downloaded += len(chunk)
|
||||||
if downloaded > MAX_DOWNLOAD_BYTES:
|
if downloaded > MAX_DOWNLOAD_BYTES:
|
||||||
|
|||||||
@ -212,6 +212,102 @@ def test_download_cap_running_total_aborts(tmp_path, monkeypatch):
|
|||||||
assert not local.exists()
|
assert not local.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# -- _looks_like_media (audit finding #10) --
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_jpeg_magic_recognised():
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
assert _looks_like_media(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01") is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_png_magic_recognised():
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
assert _looks_like_media(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_webm_magic_recognised():
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
# EBML header (Matroska/WebM): 1A 45 DF A3
|
||||||
|
assert _looks_like_media(b"\x1aE\xdf\xa3" + b"\x00" * 20) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_html_rejected():
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
assert _looks_like_media(b"<!doctype html><html><body>") is False
|
||||||
|
assert _looks_like_media(b"<html><head>") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_empty_rejected():
|
||||||
|
"""An empty buffer means the server returned nothing useful — fail
|
||||||
|
closed (rather than the on-disk validator's open-on-error fallback)."""
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
assert _looks_like_media(b"") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_media_unknown_magic_accepted():
|
||||||
|
"""Non-HTML, non-magic bytes are conservative-OK — some boorus
|
||||||
|
serve exotic-but-legal containers we don't enumerate."""
|
||||||
|
from booru_viewer.core.cache import _looks_like_media
|
||||||
|
assert _looks_like_media(b"random non-html data ") is True
|
||||||
|
|
||||||
|
|
||||||
|
# -- _do_download early header validation (audit finding #10) --
|
||||||
|
|
||||||
|
|
||||||
|
def test_do_download_early_rejects_html_payload(tmp_path):
|
||||||
|
"""A hostile server that returns HTML in the body (omitting
|
||||||
|
Content-Type so the early text/html guard doesn't fire) must be
|
||||||
|
caught by the magic-byte check before any bytes land on disk.
|
||||||
|
Audit finding #10: this used to wait for the full download to
|
||||||
|
complete before _is_valid_media rejected, wasting bandwidth."""
|
||||||
|
response = _FakeResponse(
|
||||||
|
headers={"content-length": "0"}, # no Content-Type, no length
|
||||||
|
chunks=[b"<!doctype html><html><body>500</body></html>"],
|
||||||
|
)
|
||||||
|
client = _FakeClient(response)
|
||||||
|
local = tmp_path / "out.jpg"
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not valid media"):
|
||||||
|
asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
|
||||||
|
|
||||||
|
assert not local.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_do_download_early_rejects_html_across_tiny_chunks(tmp_path):
|
||||||
|
"""The accumulator must combine chunks smaller than the 16-byte
|
||||||
|
minimum so a server delivering one byte at a time can't slip
|
||||||
|
past the magic-byte check."""
|
||||||
|
response = _FakeResponse(
|
||||||
|
headers={"content-length": "0"},
|
||||||
|
chunks=[b"<!", b"do", b"ct", b"yp", b"e ", b"ht", b"ml", b">", b"x" * 100],
|
||||||
|
)
|
||||||
|
client = _FakeClient(response)
|
||||||
|
local = tmp_path / "out.jpg"
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not valid media"):
|
||||||
|
asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
|
||||||
|
|
||||||
|
assert not local.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_do_download_writes_valid_jpeg_after_early_validation(tmp_path):
|
||||||
|
"""A real JPEG-like header passes the early check and the rest
|
||||||
|
of the stream is written through to disk. Header bytes must
|
||||||
|
appear in the final file (not be silently dropped)."""
|
||||||
|
body = b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01" + b"PAYLOAD" + b"\xff\xd9"
|
||||||
|
response = _FakeResponse(
|
||||||
|
headers={"content-length": str(len(body)), "content-type": "image/jpeg"},
|
||||||
|
chunks=[body[:8], body[8:]], # split mid-magic
|
||||||
|
)
|
||||||
|
client = _FakeClient(response)
|
||||||
|
local = tmp_path / "out.jpg"
|
||||||
|
|
||||||
|
asyncio.run(_do_download(client, "http://example.test/x.jpg", {}, local, None))
|
||||||
|
|
||||||
|
assert local.exists()
|
||||||
|
assert local.read_bytes() == body
|
||||||
|
|
||||||
|
|
||||||
# -- _is_valid_media OSError fallback --
|
# -- _is_valid_media OSError fallback --
|
||||||
|
|
||||||
def test_is_valid_media_returns_true_on_oserror(tmp_path):
|
def test_is_valid_media_returns_true_on_oserror(tmp_path):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user