Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 39 additions & 21 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("bz2 module is not available") from None
if mode == "r":
self.dbuf = b""
self.cmp = bz2.BZ2Decompressor()
self.exception = OSError
else:
Expand All @@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("lzma module is not available") from None
if mode == "r":
self.dbuf = b""
self.cmp = lzma.LZMADecompressor()
self.exception = lzma.LZMAError
else:
Expand All @@ -403,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("compression.zstd module is not available") from None
if mode == "r":
self.dbuf = b""
self.cmp = zstd.ZstdDecompressor()
self.exception = zstd.ZstdError
else:
Expand Down Expand Up @@ -485,7 +482,6 @@ def _init_read_gz(self):
"""Initialize for reading a gzip compressed fileobj.
"""
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
self.dbuf = b""

# taken from gzip.GzipFile with some alterations
if self.__read(2) != b"\037\213":
Expand Down Expand Up @@ -543,26 +539,48 @@ def _read(self, size):
if self.comptype == "tar":
return self.__read(size)

c = len(self.dbuf)
t = [self.dbuf]
c = 0
t = []
while c < size:
# Skip underlying buffer to avoid unaligned double buffering.
if self.buf:
buf = self.buf
self.buf = b""
if self.comptype == "gz":
# The zlib interface differs from the other decompressors:
# compressed input it has not consumed yet is left in unconsumed_tail.
if self.buf:
cbuf = self.buf
self.buf = b""
else:
cbuf = self.fileobj.read(self.bufsize)
if not cbuf:
break

try:
dbuf = self.cmp.decompress(cbuf, size - c)
self.buf = self.cmp.unconsumed_tail
except self.exception as e:
raise ReadError("invalid compressed data") from e
else:
buf = self.fileobj.read(self.bufsize)
if not buf:
break
try:
buf = self.cmp.decompress(buf)
except self.exception as e:
raise ReadError("invalid compressed data") from e
t.append(buf)
c += len(buf)
# Other decompressors have needs_input.
# decompress() can buffer data internally.
if self.cmp.needs_input:
cbuf = self.fileobj.read(self.bufsize)
if not cbuf:
break
else:
cbuf = b""

try:
dbuf = self.cmp.decompress(cbuf, size - c)
except self.exception as e:
raise ReadError("invalid compressed data") from e

t.append(dbuf)
c += len(dbuf)

t = b"".join(t)
self.dbuf = t[size:]
return t[:size]
if len(t) > size:
# This would only happen if decompress() has a bug.
raise ReadError("decompress() returned too much data")
return t

def __read(self, size):
"""Return size bytes from stream. If internal buffer is empty,
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ Bas van Beek
Ian Beer
Stefan Behnel
Reimer Behrends
Tomi Belan
Maxime Bélanger
Ben Bell
Thomas Bellman
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
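Fix a performance issue in :mod:`tarfile` when reading compressed archives in
streaming mode (e.g. ``r|*``): the decompressor is now asked for at most the
requested number of bytes instead of re-buffering surplus output on every read.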
Loading