python · iamsharduld · Jun 15, 2026
diff --git a/Lib/test/test_wave.py b/Lib/test/test_wave.py
@@ -10,6 +10,30 @@
 import wave
 
 
+class _ReadSizeRecorder(io.BytesIO):
+    # In-memory seekable stream for the wave tests.  max_read_size records
+    # the largest explicit size ever handed to read(), so a test can verify
+    # that wave does not ask for far more data than the file holds (which a
+    # real file would pre-allocate).  seek() raises OverflowError once the
+    # offset's magnitude exceeds the 32-bit C ssize_t maximum, like WASI,
+    # so a test can verify that wave never seeks to an untrusted chunk size.
+    _SSIZE_MAX = 0x7FFFFFFF
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_read_size = 0
+
+    def read(self, size=-1):
+        if size is not None and size >= 0 and size > self.max_read_size:
+            self.max_read_size = size
+        return super().read(size)
+
+    def seek(self, pos, whence=0):
+        if not -self._SSIZE_MAX <= pos <= self._SSIZE_MAX:
+            raise OverflowError("Python int too large to convert to C ssize_t")
+        return super().seek(pos, whence)
+
+
 class WaveTest(audiotests.AudioWriteTests,
                audiotests.AudioTestsWithSourceFile):
     module = wave
@@ -333,6 +357,25 @@ def test_read_wrong_sample_width(self):
         with self.assertRaisesRegex(wave.Error, 'bad sample width'):
             wave.open(io.BytesIO(b))
 
+    def test_read_data_chunk_size_larger_than_file(self):
+        # gh-151308: the size in a data chunk header can vastly exceed the
+        # data the file really holds.  readframes() must not hand that
+        # size to read() (a real file would pre-allocate it); on a
+        # seekable file the request is clamped to what is available.
+        payload = bytes(10)
+        huge = struct.pack('<L', 0xFFFFFFFF)  # bogus size, ~4 GiB
+        fmt = struct.pack('<LHHLLHH', 16, 1, 1, 11025, 11025, 1, 8)
+        wav = b'RIFF' + huge + b'WAVE' + b'fmt ' + fmt
+        wav += b'data' + huge + payload
+        # _ReadSizeRecorder rejects huge seek offsets with OverflowError as
+        # well, so the 32-bit (e.g. WASI) code path is covered here too.
+        recorder = _ReadSizeRecorder(wav)
+        with wave.open(recorder, 'rb') as reader:
+            frames = reader.readframes(reader.getnframes())
+        self.assertEqual(frames, payload)
+        # The read() beneath must never be handed the bogus ~4 GiB size.
+        self.assertLessEqual(recorder.max_read_size, len(wav))
+
     def test_open_in_write_raises(self):
         # gh-136523: Wave_write.__del__ should not throw
         with support.catch_unraisable_exception() as cm:

diff --git a/Lib/wave.py b/Lib/wave.py
@@ -189,6 +189,30 @@ def read(self, size=-1):
             size = self.chunksize - self.size_read
         if size > self.chunksize - self.size_read:
             size = self.chunksize - self.size_read
+        # The chunk size comes from the file header and is not trustworthy:
+        # a truncated or maliciously crafted file can claim a size far larger
+        # than the data actually present, which would make the read() below
+        # pre-allocate that many bytes (gh-151308).  When the underlying file
+        # is seekable, clamp the request to the bytes physically available so
+        # we never allocate more than the file can provide.  This leaves the
+        # data returned for valid files unchanged, since the requested bytes
+        # are always present.  We probe with tell()/seek() rather than trust
+        # seekable(), since some file objects report being seekable yet raise
+        # on the actual call; on any failure we fall back to the original
+        # behaviour.  We only probe the raw file object, never a parent
+        # _Chunk: seeking a _Chunk would seek the raw file to its (untrusted)
+        # chunk size, which may overflow on 32-bit platforms.  Clamping the
+        # raw read protects the nested chunks too, as they read through it.
+        if size > 0 and not isinstance(self.file, _Chunk):
+            try:
+                here = self.file.tell()
+                end = self.file.seek(0, 2)
+                self.file.seek(here, 0)
+            except (OSError, ValueError):
+                pass
+            else:
+                if isinstance(end, int):
+                    size = min(size, max(0, end - here))
         data = self.file.read(size)
         self.size_read = self.size_read + len(data)
         if self.size_read == self.chunksize and \

diff --git a/Misc/NEWS.d/next/Library/2026-06-15-13-04-03.gh-issue-151308.5gc0g-.rst b/Misc/NEWS.d/next/Library/2026-06-15-13-04-03.gh-issue-151308.5gc0g-.rst
@@ -0,0 +1,4 @@
+:meth:`wave.Wave_read.readframes` no longer attempts to pre-allocate a huge
+buffer when the data chunk header of a truncated or malformed WAV file claims
+a size larger than the file actually contains.  When the underlying file is
+seekable, reads are now clamped to the number of bytes really available.