Improve compressed NRRD read performance #92

Merged
47 changes: 30 additions & 17 deletions nrrd/reader.py
@@ -8,11 +8,11 @@

from nrrd.parsers import *

# Reading and writing gzipped data directly gives problems when the uncompressed
# data is larger than 4GB (2^32). Therefore we'll read and write the data in
# chunks. How this affects speed and/or memory usage is something to be analyzed
# further. The following two values define the size of the chunks.
_READ_CHUNKSIZE = 2 ** 20
# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32 bytes). This should be
# fixed in the latest version of Python 2.7 and all versions of Python 3. The fix for this issue is to read the data
# in smaller chunks. The chunk size is set large, at 4GB, to improve performance. If issues arise when decompressing
# larger files, try reducing this value.
_READ_CHUNKSIZE = 2 ** 32

_NRRD_REQUIRED_FIELDS = ['dimension', 'type', 'encoding', 'sizes']
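For reference, 2 ** 32 bytes is 4GB, so with this default the compressed payload is normally decompressed in a single pass. A minimal, hypothetical sketch of inspecting or tuning the constant from calling code (the override is the workaround suggested by the comment above, not part of this PR):

import nrrd.reader

# The default read chunk size introduced by this PR: 2 ** 32 bytes == 4GB
print(nrrd.reader._READ_CHUNKSIZE)            # 4294967296
print(nrrd.reader._READ_CHUNKSIZE / 2 ** 30)  # 4.0 (GiB)

# Hypothetical workaround: if decompressing a very large file fails or uses too
# much memory, patch in a smaller chunk size before calling nrrd.read()
nrrd.reader._READ_CHUNKSIZE = 2 ** 30  # 1GB chunks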

@@ -369,7 +369,7 @@ def read_data(header, fh=None, filename=None):
fh.readline()
else:
raise NRRDError('Invalid lineskip, allowed values are greater than or equal to 0')

# Skip the requested number of bytes or seek backward, and then parse the data using NumPy
if byte_skip < -1:
raise NRRDError('Invalid byteskip, allowed values are greater than or equal to -1')
@@ -380,9 +380,9 @@
else:
# The only case left should be: byte_skip == -1 and header['encoding'] == 'gzip'
byte_skip = -dtype.itemsize * total_data_points

# If a compression encoding is used, then byte skip AFTER decompressing
if header['encoding'] == 'raw':
if header['encoding'] == 'raw':
data = np.fromfile(fh, dtype)
elif header['encoding'] in ['ASCII', 'ascii', 'text', 'txt']:
data = np.fromfile(fh, dtype, sep=' ')
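The byteskip == -1 case above relies on Python's negative indexing: byte_skip becomes the negative of the payload size, so slicing the decompressed buffer with decompressed_data[byte_skip:] keeps only the trailing bytes that hold the array values. A small self-contained sketch of that convention (the dtype, element count, and junk prefix are made up for illustration):

import numpy as np

dtype = np.dtype(np.int16)
total_data_points = 5

# Pretend the decompressed stream has some leading bytes before the payload
decompressed_data = b'LEADING-BYTES' + np.arange(total_data_points, dtype=dtype).tobytes()

# byteskip == -1 translates into a negative offset measured from the end
byte_skip = -dtype.itemsize * total_data_points  # -10

data = np.frombuffer(decompressed_data[byte_skip:], dtype)
print(data)  # [0 1 2 3 4]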
@@ -397,20 +397,33 @@
raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])

# Decompress the data in chunks (see the _READ_CHUNKSIZE declaration for why it is read in chunks)
decompressed_data = b''
while True:
chunk = fh.read(_READ_CHUNKSIZE)
decompressed_data = bytearray()

# Read all of the remaining data from the file
# Cache the length of the compressed data since it is used repeatedly; this is more efficient
compressed_data = fh.read()
compressed_data_len = len(compressed_data)
start_index = 0

# Loop through data and decompress it chunk by chunk
while start_index < compressed_data_len:
# End index is the start index plus the chunk size,
# clamped to the data length so the final (possibly smaller) chunk is still read
end_index = min(start_index + _READ_CHUNKSIZE, compressed_data_len)

# Decompress and append data
decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])

# If chunk is None, then file is at end, break out of loop
if not chunk:
break
# Update start index
start_index = end_index

# Decompress the data and add it to the decompressed data
decompressed_data += decompobj.decompress(chunk)
# Delete the compressed data since it is no longer needed
# and could be holding on to a large amount of memory
del compressed_data

# Byte skip is applied AFTER the decompression. Skip first x bytes of the decompressed data and parse it using
# NumPy
data = np.fromstring(decompressed_data[byte_skip:], dtype)
data = np.frombuffer(decompressed_data[byte_skip:], dtype)

# Close the file
# Even if opened using with keyword, closing it does not hurt
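Putting the new read path together: the compressed payload is read in one call, decompressed in _READ_CHUNKSIZE slices into a bytearray, and the result handed to np.frombuffer (avoiding the deprecated np.fromstring). A standalone sketch of the same pattern, assuming a plain zlib stream; the chunk size and array below are only illustrative:

import zlib

import numpy as np

READ_CHUNKSIZE = 2 ** 20  # deliberately small for the example

# Build an in-memory "file" of compressed data
original = np.arange(1000, dtype=np.float64)
compressed_data = zlib.compress(original.tobytes())
compressed_data_len = len(compressed_data)

decompobj = zlib.decompressobj()
decompressed_data = bytearray()
start_index = 0

# Decompress chunk by chunk, clamping the last slice to the data length
while start_index < compressed_data_len:
    end_index = min(start_index + READ_CHUNKSIZE, compressed_data_len)
    decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])
    start_index = end_index

del compressed_data  # the compressed copy is no longer needed

data = np.frombuffer(decompressed_data, dtype=np.float64)
assert np.array_equal(data, original)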
22 changes: 11 additions & 11 deletions nrrd/writer.py
@@ -8,10 +8,10 @@
from nrrd.formatters import *
from nrrd.reader import _get_field_type

# Reading and writing gzipped data directly gives problems when the uncompressed
# data is larger than 4GB (2^32). Therefore we'll read and write the data in
# chunks. How this affects speed and/or memory usage is something to be analyzed
# further. The following two values define the size of the chunks.
# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32 bytes). This should be
# fixed in the latest version of Python 2.7 and all versions of Python 3. The fix for this issue is to write the data
# in smaller chunks. The chunk size is kept small here, at 1MB, since performance did not vary much with the chunk
# size, and a smaller chunk size has the benefit of using less RAM at once.
_WRITE_CHUNKSIZE = 2 ** 20

_NRRD_FIELD_ORDER = [
@@ -67,6 +67,7 @@
'B': 'big'
}


def _format_field_value(value, field_type):
if field_type == 'int':
return format_number(value)
@@ -94,7 +95,7 @@ def _format_field_value(value, field_type):


def write(filename, data, header=None, detached_header=False, relative_data_path=True, custom_field_map=None,
compression_level=9):
compression_level=9):
"""Write :class:`numpy.ndarray` to NRRD file

The :obj:`filename` parameter specifies the absolute or relative filename to write the NRRD file to. If the
@@ -286,16 +287,15 @@ def _write_data(data, fh, header, compression_level=None):
raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])

# Write the data in chunks (see _WRITE_CHUNKSIZE declaration for more information why)
# Cache the length of the data since it is used repeatedly; this is more efficient
start_index = 0
raw_data_len = len(raw_data)

# Loop through the data and write it by chunk
while start_index < len(raw_data):
while start_index < raw_data_len:
# End index is start index plus the chunk size
end_index = start_index + _WRITE_CHUNKSIZE

# If the end index is past the data size, then clamp it to the data size
if end_index > len(raw_data):
end_index = len(raw_data)
# Clamp to the data length so the final (possibly smaller) chunk is still written
end_index = min(start_index + _WRITE_CHUNKSIZE, raw_data_len)

# Write the compressed data
fh.write(compressobj.compress(raw_data[start_index:end_index]))
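The write side mirrors the same pattern with a 1MB chunk size: compress one slice at a time, write it out, and flush the compressor once the loop finishes. A rough standalone sketch of that flow, assuming a zlib stream and an in-memory file object (compression level 9 matches the write() default):

import io
import zlib

import numpy as np

WRITE_CHUNKSIZE = 2 ** 20  # 1MB

raw_data = np.arange(10 ** 6, dtype=np.int32).tobytes()
raw_data_len = len(raw_data)

fh = io.BytesIO()
compressobj = zlib.compressobj(9)

start_index = 0
while start_index < raw_data_len:
    # Clamp the end index so the final (possibly smaller) chunk is still written
    end_index = min(start_index + WRITE_CHUNKSIZE, raw_data_len)
    fh.write(compressobj.compress(raw_data[start_index:end_index]))
    start_index = end_index

# Flush whatever the compressor is still buffering internally
fh.write(compressobj.flush())

# Round-trip check
assert zlib.decompress(fh.getvalue()) == raw_data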