diff --git a/nrrd/reader.py b/nrrd/reader.py
index aef0b6e..aa980cd 100644
--- a/nrrd/reader.py
+++ b/nrrd/reader.py
@@ -8,11 +8,11 @@
 from nrrd.parsers import *
 
-# Reading and writing gzipped data directly gives problems when the uncompressed
-# data is larger than 4GB (2^32). Therefore we'll read and write the data in
-# chunks. How this affects speed and/or memory usage is something to be analyzed
-# further. The following two values define the size of the chunks.
-_READ_CHUNKSIZE = 2 ** 20
+# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32). This should be fixed in
+# the latest version of Python 2.7 and in all versions of Python 3. The fix for this issue is to decompress the data
+# in smaller chunks. The chunk size is set large, at 4GB (2^32), to improve performance. If issues arise when
+# decompressing larger files, try reducing this value.
+_READ_CHUNKSIZE = 2 ** 32
 
 _NRRD_REQUIRED_FIELDS = ['dimension', 'type', 'encoding', 'sizes']
@@ -369,7 +369,7 @@ def read_data(header, fh=None, filename=None):
             fh.readline()
     else:
         raise NRRDError('Invalid lineskip, allowed values are greater than or equal to 0')
-    
+
     # Skip the requested number of bytes or seek backward, and then parse the data using NumPy
     if byte_skip < -1:
         raise NRRDError('Invalid byteskip, allowed values are greater than or equal to -1')
@@ -380,9 +380,9 @@ def read_data(header, fh=None, filename=None):
     else:
         # The only case left should be: byte_skip == -1 and header['encoding'] == 'gzip'
         byte_skip = -dtype.itemsize * total_data_points
-    
+
     # If a compression encoding is used, then byte skip AFTER decompressing
-    if header['encoding'] == 'raw': 
+    if header['encoding'] == 'raw':
         data = np.fromfile(fh, dtype)
     elif header['encoding'] in ['ASCII', 'ascii', 'text', 'txt']:
         data = np.fromfile(fh, dtype, sep=' ')
@@ -397,20 +397,33 @@ def read_data(header, fh=None, filename=None):
             raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])
 
         # Loop through the file and read a chunk at a time (see _READ_CHUNKSIZE why it is read in chunks)
-        decompressed_data = b''
-        while True:
-            chunk = fh.read(_READ_CHUNKSIZE)
+        decompressed_data = bytearray()
+
+        # Read all of the remaining data from the file
+        # Obtain the length of the compressed data once since it is used repeatedly in the loop below
+        compressed_data = fh.read()
+        compressed_data_len = len(compressed_data)
+        start_index = 0
+
+        # Loop through the data and decompress it chunk by chunk
+        while start_index < compressed_data_len:
+            # Calculate the end index as the start index plus the chunk size
+            # Clamp it to the data length so the final, possibly smaller, chunk is still decompressed
+            end_index = min(start_index + _READ_CHUNKSIZE, compressed_data_len)
+
+            # Decompress the chunk and append it to the decompressed data
+            decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])
 
-            # If chunk is None, then file is at end, break out of loop
-            if not chunk:
-                break
+            # Update the start index for the next chunk
+            start_index = end_index
 
-            # Decompress the data and add it to the decompressed data
-            decompressed_data += decompobj.decompress(chunk)
+        # Delete the compressed data since it is no longer needed
+        # and could be holding onto a significant amount of memory
+        del compressed_data
 
         # Byte skip is applied AFTER the decompression. Skip first x bytes of the decompressed data and parse it using
         # NumPy
-        data = np.fromstring(decompressed_data[byte_skip:], dtype)
+        data = np.frombuffer(decompressed_data[byte_skip:], dtype)
 
     # Close the file
     # Even if opened using with keyword, closing it does not hurt
diff --git a/nrrd/writer.py b/nrrd/writer.py
index b628d47..4897560 100644
--- a/nrrd/writer.py
+++ b/nrrd/writer.py
@@ -8,10 +8,10 @@
 from nrrd.formatters import *
 from nrrd.reader import _get_field_type
 
-# Reading and writing gzipped data directly gives problems when the uncompressed
-# data is larger than 4GB (2^32). Therefore we'll read and write the data in
-# chunks. How this affects speed and/or memory usage is something to be analyzed
-# further. The following two values define the size of the chunks.
+# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32). This should be fixed in
+# the latest version of Python 2.7 and in all versions of Python 3. The fix for this issue is to compress and write
+# the data in smaller chunks. The chunk size is kept small here, at 1MB, since performance did not vary much with the
+# chunk size and a smaller chunk size has the benefit of using less RAM at once.
 _WRITE_CHUNKSIZE = 2 ** 20
 
 _NRRD_FIELD_ORDER = [
@@ -67,6 +67,7 @@
     'B': 'big'
 }
 
+
 def _format_field_value(value, field_type):
     if field_type == 'int':
         return format_number(value)
@@ -94,7 +95,7 @@ def _format_field_value(value, field_type):
 
 
 def write(filename, data, header=None, detached_header=False, relative_data_path=True, custom_field_map=None,
-          compression_level=9): 
+          compression_level=9):
     """Write :class:`numpy.ndarray` to NRRD file
 
     The :obj:`filename` parameter specifies the absolute or relative filename to write the NRRD file to. If the
@@ -286,16 +287,15 @@ def _write_data(data, fh, header, compression_level=None):
         raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])
 
     # Write the data in chunks (see _WRITE_CHUNKSIZE declaration for more information why)
+    # Obtain the length of the data once since it is used repeatedly in the loop below
    start_index = 0
+    raw_data_len = len(raw_data)
 
     # Loop through the data and write it by chunk
-    while start_index < len(raw_data):
+    while start_index < raw_data_len:
         # End index is start index plus the chunk size
-        end_index = start_index + _WRITE_CHUNKSIZE
-
-        # If the end index is past the data size, then clamp it to the data size
-        if end_index > len(raw_data):
-            end_index = len(raw_data)
+        # Clamp it to the data length so the final, possibly smaller, chunk is still written
+        end_index = min(start_index + _WRITE_CHUNKSIZE, raw_data_len)
 
         # Write the compressed data
         fh.write(compressobj.compress(raw_data[start_index:end_index]))
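
For reference, below is a minimal, self-contained sketch of the chunked gzip round trip that the reader and writer hunks above implement. The helper names, chunk sizes, and the round-trip check are illustrative only and are not part of the pynrrd API; the construction of the compression/decompression objects is outside the hunks shown, so the gzip wbits setting here (MAX_WBITS | 16) is the standard zlib idiom and an assumption about that surrounding code.

```python
import zlib

import numpy as np

# Illustrative chunk sizes; the diff above uses 2 ** 32 for reading and 2 ** 20 for writing
_READ_CHUNKSIZE = 2 ** 20
_WRITE_CHUNKSIZE = 2 ** 20


def compress_in_chunks(raw_data, compression_level=9):
    """Gzip-compress a bytes-like object chunk by chunk (hypothetical helper)."""
    # wbits = MAX_WBITS | 16 selects the gzip container instead of a bare zlib stream
    compressobj = zlib.compressobj(compression_level, zlib.DEFLATED, zlib.MAX_WBITS | 16)
    compressed_data = bytearray()
    start_index = 0
    raw_data_len = len(raw_data)

    while start_index < raw_data_len:
        # Clamp the end index so the final, possibly smaller, chunk is still compressed
        end_index = min(start_index + _WRITE_CHUNKSIZE, raw_data_len)
        compressed_data += compressobj.compress(raw_data[start_index:end_index])
        start_index = end_index

    # Flush whatever the compressor is still buffering internally
    compressed_data += compressobj.flush()
    return bytes(compressed_data)


def decompress_in_chunks(compressed_data):
    """Decompress gzip data chunk by chunk (hypothetical helper)."""
    decompobj = zlib.decompressobj(zlib.MAX_WBITS | 16)
    decompressed_data = bytearray()
    start_index = 0
    compressed_data_len = len(compressed_data)

    while start_index < compressed_data_len:
        end_index = min(start_index + _READ_CHUNKSIZE, compressed_data_len)
        decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])
        start_index = end_index

    return bytes(decompressed_data)


if __name__ == '__main__':
    # Round-trip check: compress and decompress a small NumPy array
    data = np.arange(10 ** 6, dtype=np.int32)
    restored = np.frombuffer(decompress_in_chunks(compress_in_chunks(data.tobytes())), dtype=np.int32)
    assert np.array_equal(data, restored)
```

Note that a final compressobj.flush() is required to emit any output still buffered by the compressor; the sketch does this inside compress_in_chunks, whereas in writer.py that flush belongs after the loop, outside the hunk shown above.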