Skip to content

Commit 1f28fdd

Browse files
committed
gh-103477: Read and write gzip header and trailer with zlib
RHEL, SLES and Ubuntu for IBM zSystems (aka s390x) ship with a zlib optimization [1] that significantly improves deflate and inflate performance on this platform by using a specialized CPU instruction. This instruction not only compresses the data, but also computes a checksum. At the moment Python's gzip support performs compression and checksum calculation separately, which creates unnecessary overhead on s390x. The reason is that Python needs to write specific values into the gzip header; and when this support was introduced in 1997, there was indeed no better way to do this. Since v1.2.2.1 (2011) zlib provides inflateGetHeader() and deflateSetHeader() functions for that, so Python does not have to deal with the exact header and trailer formats anymore. Add the new interfaces to zlibmodule.c that make use of these functions: * Add mtime argument to zlib.compress(). * Add mtime and fname arguments to zlib.compressobj(). * Add gz_header_mtime and gz_header_done properties to ZlibDecompressor. In Python modules, replace raw streams with gzip streams, make use of the new interfaces, and remove all mentions of crc32. In addition to the new interfaces above, there is an additional change in behavior that users can see: for malformed gzip headers and trailers, decompression now raises zlib.error instead of BadGzipFile. However, this is allowed by today's spec. 📜🤖 NEWS entry added by blurb_it. [1] madler/zlib#410
1 parent 3fb7c60 commit 1f28fdd

12 files changed

+216
-237
lines changed

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_global_strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ struct _Py_global_strings {
431431
STRUCT_FOR_ID(fix_imports)
432432
STRUCT_FOR_ID(flags)
433433
STRUCT_FOR_ID(flush)
434+
STRUCT_FOR_ID(fname)
434435
STRUCT_FOR_ID(follow_symlinks)
435436
STRUCT_FOR_ID(format)
436437
STRUCT_FOR_ID(frequency)
@@ -547,6 +548,7 @@ struct _Py_global_strings {
547548
STRUCT_FOR_ID(modules)
548549
STRUCT_FOR_ID(mro)
549550
STRUCT_FOR_ID(msg)
551+
STRUCT_FOR_ID(mtime)
550552
STRUCT_FOR_ID(mycmp)
551553
STRUCT_FOR_ID(n)
552554
STRUCT_FOR_ID(n_arg)

Include/internal/pycore_runtime_init_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/gzip.py

Lines changed: 33 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,15 @@ def __init__(self, filename=None, mode=None,
217217
FutureWarning, 2)
218218
self.mode = WRITE
219219
self._init_write(filename)
220+
if mtime is None:
221+
mtime = int(time.time())
220222
self.compress = zlib.compressobj(compresslevel,
221223
zlib.DEFLATED,
222-
-zlib.MAX_WBITS,
224+
16 + zlib.MAX_WBITS,
223225
zlib.DEF_MEM_LEVEL,
224-
0)
225-
self._write_mtime = mtime
226+
0,
227+
mtime=mtime,
228+
fname=self._encode_fname())
226229
self._buffer_size = _WRITE_BUFFER_SIZE
227230
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
228231
buffer_size=self._buffer_size)
@@ -231,9 +234,6 @@ def __init__(self, filename=None, mode=None,
231234

232235
self.fileobj = fileobj
233236

234-
if self.mode == WRITE:
235-
self._write_gzip_header(compresslevel)
236-
237237
@property
238238
def mtime(self):
239239
"""Last modification time read from stream, or None"""
@@ -245,7 +245,6 @@ def __repr__(self):
245245

246246
def _init_write(self, filename):
247247
self.name = filename
248-
self.crc = zlib.crc32(b"")
249248
self.size = 0
250249
self.writebuf = []
251250
self.bufsize = 0
@@ -256,9 +255,7 @@ def tell(self):
256255
self._buffer.flush()
257256
return super().tell()
258257

259-
def _write_gzip_header(self, compresslevel):
260-
self.fileobj.write(b'\037\213') # magic header
261-
self.fileobj.write(b'\010') # compression method
258+
def _encode_fname(self):
262259
try:
263260
# RFC 1952 requires the FNAME field to be Latin-1. Do not
264261
# include filenames that cannot be represented that way.
@@ -269,24 +266,7 @@ def _write_gzip_header(self, compresslevel):
269266
fname = fname[:-3]
270267
except UnicodeEncodeError:
271268
fname = b''
272-
flags = 0
273-
if fname:
274-
flags = FNAME
275-
self.fileobj.write(chr(flags).encode('latin-1'))
276-
mtime = self._write_mtime
277-
if mtime is None:
278-
mtime = time.time()
279-
write32u(self.fileobj, int(mtime))
280-
if compresslevel == _COMPRESS_LEVEL_BEST:
281-
xfl = b'\002'
282-
elif compresslevel == _COMPRESS_LEVEL_FAST:
283-
xfl = b'\004'
284-
else:
285-
xfl = b'\000'
286-
self.fileobj.write(xfl)
287-
self.fileobj.write(b'\377')
288-
if fname:
289-
self.fileobj.write(fname + b'\000')
269+
return fname
290270

291271
def write(self,data):
292272
self._check_not_closed()
@@ -311,7 +291,6 @@ def _write_raw(self, data):
311291
if length > 0:
312292
self.fileobj.write(self.compress.compress(data))
313293
self.size += length
314-
self.crc = zlib.crc32(data, self.crc)
315294
self.offset += length
316295

317296
return length
@@ -355,9 +334,6 @@ def close(self):
355334
if self.mode == WRITE:
356335
self._buffer.flush()
357336
fileobj.write(self.compress.flush())
358-
write32u(fileobj, self.crc)
359-
# self.size may exceed 2 GiB, or even 4 GiB
360-
write32u(fileobj, self.size & 0xffffffff)
361337
elif self.mode == READ:
362338
self._buffer.close()
363339
finally:
@@ -424,78 +400,17 @@ def readline(self, size=-1):
424400
return self._buffer.readline(size)
425401

426402

427-
def _read_exact(fp, n):
428-
'''Read exactly *n* bytes from `fp`
429-
430-
This method is required because fp may be unbuffered,
431-
i.e. return short reads.
432-
'''
433-
data = fp.read(n)
434-
while len(data) < n:
435-
b = fp.read(n - len(data))
436-
if not b:
437-
raise EOFError("Compressed file ended before the "
438-
"end-of-stream marker was reached")
439-
data += b
440-
return data
441-
442-
443-
def _read_gzip_header(fp):
444-
'''Read a gzip header from `fp` and progress to the end of the header.
445-
446-
Returns last mtime if header was present or None otherwise.
447-
'''
448-
magic = fp.read(2)
449-
if magic == b'':
450-
return None
451-
452-
if magic != b'\037\213':
453-
raise BadGzipFile('Not a gzipped file (%r)' % magic)
454-
455-
(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
456-
if method != 8:
457-
raise BadGzipFile('Unknown compression method')
458-
459-
if flag & FEXTRA:
460-
# Read & discard the extra field, if present
461-
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
462-
_read_exact(fp, extra_len)
463-
if flag & FNAME:
464-
# Read and discard a null-terminated string containing the filename
465-
while True:
466-
s = fp.read(1)
467-
if not s or s==b'\000':
468-
break
469-
if flag & FCOMMENT:
470-
# Read and discard a null-terminated string containing a comment
471-
while True:
472-
s = fp.read(1)
473-
if not s or s==b'\000':
474-
break
475-
if flag & FHCRC:
476-
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
477-
return last_mtime
478-
479-
480403
class _GzipReader(_compression.DecompressReader):
481404
def __init__(self, fp):
482405
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
483-
wbits=-zlib.MAX_WBITS)
406+
wbits=16 + zlib.MAX_WBITS)
484407
# Set flag indicating start of a new member
485408
self._new_member = True
486409
self._last_mtime = None
487410

488411
def _init_read(self):
489-
self._crc = zlib.crc32(b"")
490412
self._stream_size = 0 # Decompressed size of unconcatenated stream
491413

492-
def _read_gzip_header(self):
493-
last_mtime = _read_gzip_header(self._fp)
494-
if last_mtime is None:
495-
return False
496-
self._last_mtime = last_mtime
497-
return True
498-
499414
def read(self, size=-1):
500415
if size < 0:
501416
return self.readall()
@@ -509,33 +424,35 @@ def read(self, size=-1):
509424
while True:
510425
if self._decompressor.eof:
511426
# Ending case: we've come to the end of a member in the file,
512-
# so finish up this member, and read a new gzip header.
513-
# Check the CRC and file size, and set the flag so we read
514-
# a new member
427+
# so finish up this member and set the flag, so that we read a
428+
# new member
515429
self._read_eof()
516430
self._new_member = True
517431
self._decompressor = self._decomp_factory(
518432
**self._decomp_args)
519433

520-
if self._new_member:
521-
# If the _new_member flag is set, we have to
522-
# jump to the next member, if there is one.
523-
self._init_read()
524-
if not self._read_gzip_header():
525-
self._size = self._pos
526-
return b""
527-
self._new_member = False
528-
529434
# Read a chunk of data from the file
530435
if self._decompressor.needs_input:
531436
buf = self._fp.read(READ_BUFFER_SIZE)
437+
if self._new_member:
438+
# If the _new_member flag is set, we have to
439+
# jump to the next member, if there is one.
440+
self._init_read()
441+
if not buf:
442+
self._size = self._pos
443+
return b""
444+
self._new_member = False
532445
uncompress = self._decompressor.decompress(buf, size)
533446
else:
447+
assert not self._new_member
534448
uncompress = self._decompressor.decompress(b"", size)
535449

450+
if self._decompressor.gz_header_done:
451+
self._last_mtime = self._decompressor.gz_header_mtime
452+
536453
if self._decompressor.unused_data != b"":
537454
# Prepend the already read bytes to the fileobj so they can
538-
# be seen by _read_eof() and _read_gzip_header()
455+
# be seen by _read_eof()
539456
self._fp.prepend(self._decompressor.unused_data)
540457

541458
if uncompress != b"":
@@ -544,23 +461,12 @@ def read(self, size=-1):
544461
raise EOFError("Compressed file ended before the "
545462
"end-of-stream marker was reached")
546463

547-
self._crc = zlib.crc32(uncompress, self._crc)
548464
self._stream_size += len(uncompress)
549465
self._pos += len(uncompress)
550466
return uncompress
551467

552468
def _read_eof(self):
553469
# We've read to the end of the file
554-
# We check that the computed CRC and size of the
555-
# uncompressed data matches the stored values. Note that the size
556-
# stored is the true file size mod 2**32.
557-
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
558-
if crc32 != self._crc:
559-
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
560-
hex(self._crc)))
561-
elif isize != (self._stream_size & 0xffffffff):
562-
raise BadGzipFile("Incorrect length of data produced")
563-
564470
# Gzip files can be padded with zeroes and still have archives.
565471
# Consume all zero bytes and set the file position to the first
566472
# non-zero byte. See http://www.gzip.org/#faq8
@@ -575,68 +481,32 @@ def _rewind(self):
575481
self._new_member = True
576482

577483

578-
def _create_simple_gzip_header(compresslevel: int,
579-
mtime = None) -> bytes:
580-
"""
581-
Write a simple gzip header with no extra fields.
582-
:param compresslevel: Compresslevel used to determine the xfl bytes.
583-
:param mtime: The mtime (must support conversion to a 32-bit integer).
584-
:return: A bytes object representing the gzip header.
585-
"""
586-
if mtime is None:
587-
mtime = time.time()
588-
if compresslevel == _COMPRESS_LEVEL_BEST:
589-
xfl = 2
590-
elif compresslevel == _COMPRESS_LEVEL_FAST:
591-
xfl = 4
592-
else:
593-
xfl = 0
594-
# Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
595-
# fields added to header), mtime, xfl and os (255 for unknown OS).
596-
return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
597-
598-
599484
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
600485
"""Compress data in one shot and return the compressed string.
601486
602487
compresslevel sets the compression level in range of 0-9.
603488
mtime can be used to set the modification time. The modification time is
604489
set to the current time by default.
605490
"""
606-
if mtime == 0:
607-
# Use zlib as it creates the header with 0 mtime by default.
608-
# This is faster and with less overhead.
609-
return zlib.compress(data, level=compresslevel, wbits=31)
610-
header = _create_simple_gzip_header(compresslevel, mtime)
611-
trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
612-
# Wbits=-15 creates a raw deflate block.
613-
return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
614-
trailer)
491+
if mtime is None:
492+
mtime = int(time.time())
493+
return zlib.compress(data, level=compresslevel, wbits=31, mtime=mtime)
615494

616495

617496
def decompress(data):
618497
"""Decompress a gzip compressed string in one shot.
619498
Return the decompressed string.
620499
"""
621500
decompressed_members = []
622-
while True:
623-
fp = io.BytesIO(data)
624-
if _read_gzip_header(fp) is None:
625-
return b"".join(decompressed_members)
626-
# Use a zlib raw deflate compressor
627-
do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
628-
# Read all the data except the header
629-
decompressed = do.decompress(data[fp.tell():])
630-
if not do.eof or len(do.unused_data) < 8:
501+
while data:
502+
do = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
503+
decompressed = do.decompress(data)
504+
if not do.eof:
631505
raise EOFError("Compressed file ended before the end-of-stream "
632506
"marker was reached")
633-
crc, length = struct.unpack("<II", do.unused_data[:8])
634-
if crc != zlib.crc32(decompressed):
635-
raise BadGzipFile("CRC check failed")
636-
if length != (len(decompressed) & 0xffffffff):
637-
raise BadGzipFile("Incorrect length of data produced")
638507
decompressed_members.append(decompressed)
639-
data = do.unused_data[8:].lstrip(b"\x00")
508+
data = do.unused_data.lstrip(b"\x00")
509+
return b"".join(decompressed_members)
640510

641511

642512
def main():

0 commit comments

Comments
 (0)