Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix offset error in _PaddedFile caused by concatenated gzips. #61

Merged
merged 3 commits into from
Mar 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ Changelog
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.

version 0.8.1
-----------------
+ Fix a bug where multi-member gzip files were read incorrectly due to an
offset error. This was caused by ISA-L's decompressobj having a small
bitbuffer which was not properly taken into account in some circumstances.

version 0.8.0
-----------------
+ Speed up ``igzip.compress`` and ``igzip.decompress`` by improving the
Expand Down
31 changes: 28 additions & 3 deletions src/isal/igzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import sys
import time
from typing import List, Optional, SupportsInt
import _compression # noqa: I201 # Not third-party

from . import isal_zlib

Expand Down Expand Up @@ -204,11 +205,35 @@ def write(self, data):
return length


class _PaddedFile(gzip._PaddedFile):
    """A ``gzip._PaddedFile`` whose ``prepend`` tolerates foreign data.

    The ``prepend`` method of ``gzip._PaddedFile`` assumes that the
    prepended data was always read from its own ``_buffer``, so it only
    rewinds the read position. Unfortunately ``isal_zlib.decompressobj``
    has a bitbuffer as well, whose bytes may be part of the data handed
    back. Those bytes were read *before* the current buffer, which would
    make the read position negative. This override detects that case and
    rebuilds the buffer instead.
    """

    def prepend(self, prepend=b''):
        """Push ``prepend`` back so that it is returned by the next read.

        :param prepend: bytes that must be read again before anything else.
        """
        if self._read is not None:
            # Assume data was read since the last prepend() call.
            position = self._read
            self._read -= len(prepend)
            if self._read >= 0:
                # All pushed-back data came from the buffer: rewinding the
                # read position is sufficient.
                return
            # If self._read is negative the data was read further back than
            # the start of the buffer, so the buffer needs to be rebuilt.
            # Keep whatever was still unread in the old buffer behind the
            # prepended data, otherwise those bytes would be lost.
            unread = self._buffer[position:]
            if unread:
                prepend = prepend + unread
        self._buffer = prepend
        self._length = len(self._buffer)
        self._read = 0


class _IGzipReader(gzip._GzipReader):
def __init__(self, fp):
super().__init__(fp)
self._decomp_factory = isal_zlib.decompressobj
self._decompressor = self._decomp_factory(**self._decomp_args)
# Call the init method of gzip._GzipReader's parent here.
# It is not very invasive and allows us to override _PaddedFile
_compression.DecompressReader.__init__(
self, _PaddedFile(fp), isal_zlib.decompressobj,
wbits=-isal_zlib.MAX_WBITS)
# Set flag indicating start of a new member
self._new_member = True
self._last_mtime = None

def _add_read_data(self, data):
# Use faster isal crc32 calculation and update the stream size in place
Expand Down
Binary file added tests/data/concatenated.fastq.gz
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/test_igzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,11 @@ def test_header_corrupt():
def test_truncated_header(trunc):
with pytest.raises(EOFError):
igzip.decompress(trunc)


def test_concatenated_gzip():
    """Reading a multi-member gzip file with igzip matches stdlib gzip."""
    data_dir = Path(__file__).parent.joinpath("data")
    concat = data_dir.joinpath("concatenated.fastq.gz")
    # Reference output produced by the standard library implementation.
    expected = gzip.decompress(concat.read_bytes())
    with igzip.open(concat, "rb") as handle:
        actual = handle.read()
    assert expected == actual