Improve compressed NRRD read performance #92

Merged
47 changes: 30 additions & 17 deletions nrrd/reader.py
@@ -8,11 +8,11 @@

from nrrd.parsers import *

# Reading and writing gzipped data directly gives problems when the uncompressed
# data is larger than 4GB (2^32). Therefore we'll read and write the data in
# chunks. How this affects speed and/or memory usage is something to be analyzed
# further. The following two values define the size of the chunks.
_READ_CHUNKSIZE = 2 ** 20
# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32 bytes). This should be
# fixed in the latest version of Python 2.7 and all versions of Python 3. The fix for this issue is to read the data
# in smaller chunks. The chunk size is set large, at 4GB, to improve performance. If issues arise when decompressing
# larger files, try reducing this value.
_READ_CHUNKSIZE = 2 ** 32

_NRRD_REQUIRED_FIELDS = ['dimension', 'type', 'encoding', 'sizes']
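For reference, 2 ** 32 bytes is 4GB, so with this default the compressed payload is normally decompressed in a single pass. A minimal, hypothetical sketch of inspecting or tuning the constant from calling code (the override is the workaround suggested by the comment above, not part of this PR):

import nrrd.reader

# The default read chunk size introduced by this PR: 2 ** 32 bytes == 4GB
print(nrrd.reader._READ_CHUNKSIZE)            # 4294967296
print(nrrd.reader._READ_CHUNKSIZE / 2 ** 30)  # 4.0 (GiB)

# Hypothetical workaround: if decompressing a very large file fails or uses too
# much memory, patch in a smaller chunk size before calling nrrd.read()
nrrd.reader._READ_CHUNKSIZE = 2 ** 30  # 1GB chunks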

@@ -369,7 +369,7 @@ def read_data(header, fh=None, filename=None):
fh.readline()
else:
raise NRRDError('Invalid lineskip, allowed values are greater than or equal to 0')

# Skip the requested number of bytes or seek backward, and then parse the data using NumPy
if byte_skip < -1:
raise NRRDError('Invalid byteskip, allowed values are greater than or equal to -1')
@@ -380,9 +380,9 @@
else:
# The only case left should be: byte_skip == -1 and header['encoding'] == 'gzip'
byte_skip = -dtype.itemsize * total_data_points

# If a compression encoding is used, then byte skip AFTER decompressing
if header['encoding'] == 'raw':
if header['encoding'] == 'raw':
data = np.fromfile(fh, dtype)
elif header['encoding'] in ['ASCII', 'ascii', 'text', 'txt']:
data = np.fromfile(fh, dtype, sep=' ')
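The byteskip == -1 case above relies on Python's negative indexing: byte_skip becomes the negative of the payload size, so slicing the decompressed buffer with decompressed_data[byte_skip:] keeps only the trailing bytes that hold the array values. A small self-contained sketch of that convention (the dtype, element count, and junk prefix are made up for illustration):

import numpy as np

dtype = np.dtype(np.int16)
total_data_points = 5

# Pretend the decompressed stream has some leading bytes before the payload
decompressed_data = b'LEADING-BYTES' + np.arange(total_data_points, dtype=dtype).tobytes()

# byteskip == -1 translates into a negative offset measured from the end
byte_skip = -dtype.itemsize * total_data_points  # -10

data = np.frombuffer(decompressed_data[byte_skip:], dtype)
print(data)  # [0 1 2 3 4]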
@@ -397,20 +397,33 @@
raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])

# Decompress the data in chunks (see the _READ_CHUNKSIZE declaration for why it is read in chunks)
decompressed_data = b''
while True:
chunk = fh.read(_READ_CHUNKSIZE)
decompressed_data = bytearray()

# Read all of the remaining data from the file
# Cache the length of the compressed data since it is used repeatedly; this is more efficient
compressed_data = fh.read()
compressed_data_len = len(compressed_data)
start_index = 0

# Loop through data and decompress it chunk by chunk
while start_index < compressed_data_len:
# End index is the start index plus the chunk size,
# clamped to the data length so the final (possibly smaller) chunk is still read
end_index = min(start_index + _READ_CHUNKSIZE, compressed_data_len)

# Decompress and append data
decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])

# If chunk is None, then file is at end, break out of loop
if not chunk:
break
# Update start index
start_index = end_index

# Decompress the data and add it to the decompressed data
decompressed_data += decompobj.decompress(chunk)
# Delete the compressed data since it is no longer needed
# and could be holding on to a large amount of memory
del compressed_data

# Byte skip is applied AFTER the decompression. Skip first x bytes of the decompressed data and parse it using
# NumPy
data = np.fromstring(decompressed_data[byte_skip:], dtype)
data = np.frombuffer(decompressed_data[byte_skip:], dtype)

# Close the file
# Even if opened using with keyword, closing it does not hurt
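Putting the new read path together: the compressed payload is read in one call, decompressed in _READ_CHUNKSIZE slices into a bytearray, and the result handed to np.frombuffer (avoiding the deprecated np.fromstring). A standalone sketch of the same pattern, assuming a plain zlib stream; the chunk size and array below are only illustrative:

import zlib

import numpy as np

READ_CHUNKSIZE = 2 ** 20  # deliberately small for the example

# Build an in-memory "file" of compressed data
original = np.arange(1000, dtype=np.float64)
compressed_data = zlib.compress(original.tobytes())
compressed_data_len = len(compressed_data)

decompobj = zlib.decompressobj()
decompressed_data = bytearray()
start_index = 0

# Decompress chunk by chunk, clamping the last slice to the data length
while start_index < compressed_data_len:
    end_index = min(start_index + READ_CHUNKSIZE, compressed_data_len)
    decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])
    start_index = end_index

del compressed_data  # the compressed copy is no longer needed

data = np.frombuffer(decompressed_data, dtype=np.float64)
assert np.array_equal(data, original)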
22 changes: 11 additions & 11 deletions nrrd/writer.py
@@ -8,10 +8,10 @@
from nrrd.formatters import *
from nrrd.reader import _get_field_type

# Reading and writing gzipped data directly gives problems when the uncompressed
# data is larger than 4GB (2^32). Therefore we'll read and write the data in
# chunks. How this affects speed and/or memory usage is something to be analyzed
# further. The following two values define the size of the chunks.
# Older versions of Python had issues when the uncompressed data was larger than 4GB (2^32 bytes). This should be
# fixed in the latest version of Python 2.7 and all versions of Python 3. The fix for this issue is to write the data
# in smaller chunks. The chunk size is kept small here, at 1MB, since performance did not vary much with the chunk
# size, and a smaller chunk size has the benefit of using less RAM at once.
_WRITE_CHUNKSIZE = 2 ** 20

_NRRD_FIELD_ORDER = [
@@ -67,6 +67,7 @@
'B': 'big'
}


def _format_field_value(value, field_type):
if field_type == 'int':
return format_number(value)
@@ -94,7 +95,7 @@ def _format_field_value(value, field_type):


def write(filename, data, header=None, detached_header=False, relative_data_path=True, custom_field_map=None,
compression_level=9):
compression_level=9):
"""Write :class:`numpy.ndarray` to NRRD file

The :obj:`filename` parameter specifies the absolute or relative filename to write the NRRD file to. If the
@@ -286,16 +287,15 @@ def _write_data(data, fh, header, compression_level=None):
raise NRRDError('Unsupported encoding: "%s"' % header['encoding'])

# Write the data in chunks (see _WRITE_CHUNKSIZE declaration for more information why)
# Cache the length of the data since it is used repeatedly; this is more efficient
start_index = 0
raw_data_len = len(raw_data)

# Loop through the data and write it by chunk
while start_index < len(raw_data):
while start_index < raw_data_len:
# End index is start index plus the chunk size
end_index = start_index + _WRITE_CHUNKSIZE

# If the end index is past the data size, then clamp it to the data size
if end_index > len(raw_data):
end_index = len(raw_data)
# Clamp to the data length so the final (possibly smaller) chunk is still written
end_index = min(start_index + _WRITE_CHUNKSIZE, raw_data_len)

# Write the compressed data
fh.write(compressobj.compress(raw_data[start_index:end_index]))
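The write side mirrors the same pattern with a 1MB chunk size: compress one slice at a time, write it out, and flush the compressor once the loop finishes. A rough standalone sketch of that flow, assuming a zlib stream and an in-memory file object (compression level 9 matches the write() default):

import io
import zlib

import numpy as np

WRITE_CHUNKSIZE = 2 ** 20  # 1MB

raw_data = np.arange(10 ** 6, dtype=np.int32).tobytes()
raw_data_len = len(raw_data)

fh = io.BytesIO()
compressobj = zlib.compressobj(9)

start_index = 0
while start_index < raw_data_len:
    # Clamp the end index so the final (possibly smaller) chunk is still written
    end_index = min(start_index + WRITE_CHUNKSIZE, raw_data_len)
    fh.write(compressobj.compress(raw_data[start_index:end_index]))
    start_index = end_index

# Flush whatever the compressor is still buffering internally
fh.write(compressobj.flush())

# Round-trip check
assert zlib.decompress(fh.getvalue()) == raw_data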