From 729cd01f6d951e90e3d6a6da11e953b6996ef308 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2020 10:45:37 +0000 Subject: [PATCH 1/5] First version dask-proxy shortcut code. --- iris_grib/__init__.py | 67 ++++++++++++++++++++++++++++++++++++++----- iris_grib/message.py | 49 +++++++++++++++++-------------- 2 files changed, 88 insertions(+), 28 deletions(-) diff --git a/iris_grib/__init__.py b/iris_grib/__init__.py index dcc32799..2bd0bdd1 100644 --- a/iris_grib/__init__.py +++ b/iris_grib/__init__.py @@ -81,6 +81,53 @@ unknown_string = "???" +def _fake_empty_getitem(keys, shape, dtype): + """ + Detect certain cases where an array slicing will yield no data. + + Args: + + * keys (indexing key, or tuple of keys): + The argument(s) from an array __getitem__ call. + * shape (tuple of int): + The shape of the array being indexed. + * dtype (numpy.dtype): + The dtype of the array being indexed. + + Returns: + result (np.ndarray or None): + If 'keys' contains a slice(0, 0), this is an ndarray of the correct + shape and provided dtype. + Otherwise it is None. + + Note: this is used to avoid Proxy array objects wrapped as Dask arrays + from fetching their file data in these cases. + This is because, for Dask >= 2.0, "dask.array.from_array" does a + fetch like [0:0, 0:0, ...], to 'snapshot' the array metadata. + This lets us avoid fetching the file data in those cases, as none of it is + then used. + + """ + # Convert a single key to a 1-tuple, for empty-slice testing. + if isinstance(keys, tuple): + keys_tuple = keys + else: + keys_tuple = (keys,) + + if any(key == slice(0, 0) for key in keys_tuple): + # When an 'empty' slice is present, return a 'fake' array instead. 
+ target_shape = list(shape) + for i_dim, key in enumerate(keys_tuple): + if key == slice(0, 0): + target_shape[i_dim] = 0 + result = np.zeros((1,), dtype=dtype) + result = np.broadcast_to(result, target_shape) + else: + result = None + + return result + + class GribDataProxy: """A reference to the data payload of a single Grib message.""" @@ -97,13 +144,19 @@ def ndim(self): return len(self.shape) def __getitem__(self, keys): - with open(self.path, 'rb') as grib_fh: - grib_fh.seek(self.offset) - grib_message = gribapi.grib_new_from_file(grib_fh) - data = _message_values(grib_message, self.shape) - gribapi.grib_release(grib_message) - - return data.__getitem__(keys) + # Avoid fetching file data just to return an 'empty' result. + # Needed because of how dask.array.from_array behaves since Dask v2.0. + result = _fake_empty_getitem(keys, self.shape, self.dtype) + if result is None: + with open(self.path, 'rb') as grib_fh: + grib_fh.seek(self.offset) + grib_message = gribapi.grib_new_from_file(grib_fh) + data = _message_values(grib_message, self.shape) + gribapi.grib_release(grib_message) + + result = data.__getitem__(keys) + + return result def __repr__(self): msg = '<{self.__class__.__name__} shape={self.shape} ' \ diff --git a/iris_grib/message.py b/iris_grib/message.py index 7f2b8598..335a5b18 100644 --- a/iris_grib/message.py +++ b/iris_grib/message.py @@ -12,6 +12,7 @@ import re import gribapi +from iris_grib import _fake_empty_getitem as fake_empty_getitem import numpy as np import numpy.ma as ma @@ -228,29 +229,35 @@ def _bitmap(self, bitmap_section): def __getitem__(self, keys): # NB. Currently assumes that the validity of this interpretation # is checked before this proxy is created. - message = self.recreate_raw() - sections = message.sections - bitmap_section = sections[6] - bitmap = self._bitmap(bitmap_section) - data = sections[7]['codedValues'] - - if bitmap is not None: - # Note that bitmap and data are both 1D arrays at this point. 
- if np.count_nonzero(bitmap) == data.shape[0]: - # Only the non-masked values are included in codedValues. - _data = np.empty(shape=bitmap.shape) - _data[bitmap.astype(bool)] = data - # `ma.masked_array` masks where input = 1, the opposite of - # the behaviour specified by the GRIB spec. - data = ma.masked_array(_data, mask=np.logical_not(bitmap), - fill_value=np.nan) - else: - msg = 'Shapes of data and bitmap do not match.' - raise TranslationError(msg) - data = data.reshape(self.shape) + # Avoid fetching file data just to return an 'empty' result. + # Needed because of how dask.array.from_array behaves since Dask v2.0. + result = fake_empty_getitem(keys=keys, shape=self.shape, self.dtype) + if result is None: + message = self.recreate_raw() + sections = message.sections + bitmap_section = sections[6] + bitmap = self._bitmap(bitmap_section) + data = sections[7]['codedValues'] + + if bitmap is not None: + # Note that bitmap and data are both 1D arrays at this point. + if np.count_nonzero(bitmap) == data.shape[0]: + # Only the non-masked values are included in codedValues. + _data = np.empty(shape=bitmap.shape) + _data[bitmap.astype(bool)] = data + # `ma.masked_array` masks where input = 1, the opposite of + # the behaviour specified by the GRIB spec. + data = ma.masked_array(_data, mask=np.logical_not(bitmap), + fill_value=np.nan) + else: + msg = 'Shapes of data and bitmap do not match.' + raise TranslationError(msg) + + data = data.reshape(self.shape) + result = data.__getitem__(keys) - return data.__getitem__(keys) + return result def __repr__(self): msg = '<{self.__class__.__name__} shape={self.shape} ' \ From ff494f92b0b5cf40ecf56ed2df732048179b80e7 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2020 18:45:25 +0000 Subject: [PATCH 2/5] Replace routine with iris version and add a no-fetch test. 
--- iris_grib/__init__.py | 117 +++++++++++------- iris_grib/message.py | 4 +- .../tests/unit/message/test__DataProxy.py | 28 +++++ 3 files changed, 99 insertions(+), 50 deletions(-) diff --git a/iris_grib/__init__.py b/iris_grib/__init__.py index 2bd0bdd1..43e999c7 100644 --- a/iris_grib/__init__.py +++ b/iris_grib/__init__.py @@ -25,6 +25,74 @@ import iris.coord_systems as coord_systems from iris.exceptions import TranslationError, NotYetImplementedError +try: + from iris.util import _array_slice_ifempty +except ImportError: + # A temporary cut-and-paste hack, until this is in an Iris release ... + # see : https://github.com/SciTools/iris/pull/3659 + def _array_slice_ifempty(keys, shape, dtype): + """ + Detect cases where an array slice will contain no data, as it contains + a zero-length dimension, and produce an equivalent result for those + cases. + + The function indicates 'empty' slicing cases, by returning an array + equal to the slice result in those cases. + + Args: + + * keys (indexing key, or tuple of keys): + The argument from an array __getitem__ call. + Only tuples of integers and slices are supported, in particular no + newaxis, ellipsis or array keys. + These are the types of array access usage we expect from Dask. + * shape (tuple of int): + The shape of the array being indexed. + * dtype (numpy.dtype): + The dtype of the array being indexed. + + Returns: + result (np.ndarray or None): + If 'keys' contains a slice(0, 0), this is an ndarray of the + correct resulting shape and provided dtype. + Otherwise it is None. + + .. note:: + + This is used to prevent DataProxy arraylike objects from fetching + their file data when wrapped as Dask arrays. + This is because, for Dask >= 2.0, the "dask.array.from_array" call + performs a fetch like [0:0, 0:0, ...], to snapshot array metadata. + This function enables us to avoid triggering a file data fetch in + those cases : This is consistent because the result will not + contain any actual data content. 
+ + """ + # Convert single key into a 1-tuple, so we always have a tuple of keys. + if isinstance(keys, tuple): + keys_tuple = keys + else: + keys_tuple = (keys,) + + if any(key == slice(0, 0) for key in keys_tuple): + # An 'empty' slice is present : Return a 'fake' array instead. + target_shape = list(shape) + for i_dim, key in enumerate(keys_tuple): + if key == slice(0, 0): + # Reduce dims with empty slicing to length 0. + target_shape[i_dim] = 0 + # Create a prototype result : no memory usage, as some dims are 0. + result = np.zeros(target_shape, dtype=dtype) + # Index with original keys to produce the desired result shape. + # Note : also ok in 0-length dims, as the slice is always '0:0'. + result = result[keys] + else: + result = None + + return result + + + # NOTE: careful here, to avoid circular imports (as iris imports grib) from . import grib_phenom_translation as gptx from . import _save_rules @@ -81,53 +149,6 @@ unknown_string = "???" -def _fake_empty_getitem(keys, shape, dtype): - """ - Detect certain cases where an array slicing will yield no data. - - Args: - - * keys (indexing key, or tuple of keys): - The argument(s) from an array __getitem__ call. - * shape (tuple of int): - The shape of the array being indexed. - * dtype (numpy.dtype): - The dtype of the array being indexed. - - Returns: - result (np.ndarray or None): - If 'keys' contains a slice(0, 0), this is an ndarray of the correct - shape and provided dtype. - Otherwise it is None. - - Note: this is used to avoid Proxy array objects wrapped as Dask arrays - from fetching their file data in these cases. - This is because, for Dask >= 2.0, "dask.array.from_array" does a - fetch like [0:0, 0:0, ...], to 'snapshot' the array metadata. - This lets us avoid fetching the file data in those cases, as none of it is - then used. - - """ - # Convert a single key to a 1-tuple, for empty-slice testing. 
- if isinstance(keys, tuple): - keys_tuple = keys - else: - keys_tuple = (keys,) - - if any(key == slice(0, 0) for key in keys_tuple): - # When an 'empty' slice is present, return a 'fake' array instead. - target_shape = list(shape) - for i_dim, key in enumerate(keys_tuple): - if key == slice(0, 0): - target_shape[i_dim] = 0 - result = np.zeros((1,), dtype=dtype) - result = np.broadcast_to(result, target_shape) - else: - result = None - - return result - - class GribDataProxy: """A reference to the data payload of a single Grib message.""" @@ -146,7 +167,7 @@ def ndim(self): def __getitem__(self, keys): # Avoid fetching file data just to return an 'empty' result. # Needed because of how dask.array.from_array behaves since Dask v2.0. - result = _fake_empty_getitem(keys, self.shape, self.dtype) + result = _array_slice_ifempty(keys, self.shape, self.dtype) if result is None: with open(self.path, 'rb') as grib_fh: grib_fh.seek(self.offset) diff --git a/iris_grib/message.py b/iris_grib/message.py index 335a5b18..06642f24 100644 --- a/iris_grib/message.py +++ b/iris_grib/message.py @@ -12,7 +12,7 @@ import re import gribapi -from iris_grib import _fake_empty_getitem as fake_empty_getitem +from iris_grib import _array_slice_ifempty import numpy as np import numpy.ma as ma @@ -232,7 +232,7 @@ def __getitem__(self, keys): # Avoid fetching file data just to return an 'empty' result. # Needed because of how dask.array.from_array behaves since Dask v2.0. - result = fake_empty_getitem(keys=keys, shape=self.shape, self.dtype) + result = _array_slice_ifempty(keys, self.shape, self.dtype) if result is None: message = self.recreate_raw() sections = message.sections diff --git a/iris_grib/tests/unit/message/test__DataProxy.py b/iris_grib/tests/unit/message/test__DataProxy.py index 3b6dc6a3..73cfeb35 100644 --- a/iris_grib/tests/unit/message/test__DataProxy.py +++ b/iris_grib/tests/unit/message/test__DataProxy.py @@ -12,6 +12,8 @@ # importing anything else. 
import iris_grib.tests as tests +from unittest import mock + import numpy as np from numpy.random import randint @@ -41,5 +43,31 @@ def test_bitmap__invalid_indicator(self): data_proxy._bitmap(section_6) +class Test_emptyfetch(tests.IrisGribTest): + # See : + # iris.tests.unit.fileformats.pp.test_PPDataProxy.Test__getitem__slicing + # In this case, test *only* the no-data-read effect, not the method which + # is part of Iris. + def test_empty_slice(self): + # Check behaviour of the getitem call with an 'empty' slicing. + # This is necessary because, since Dask 2.0, the "from_array" function + # takes a zero-length slice of its array argument, to capture array + # metadata, and in those cases we want to avoid file access. + test_dtype = np.dtype(np.float32) + mock_datafetch = mock.MagicMock() + proxy = _DataProxy(shape=(3, 4), + dtype=np.dtype(np.float32), + recreate_raw=mock_datafetch) + + # Test the special no-data indexing operation. + result = proxy[0:0, 0:0] + + # Check the behaviour and results were as expected. + self.assertEqual(mock_datafetch.call_count, 0) + self.assertIsInstance(result, np.ndarray) + self.assertEqual(result.dtype, test_dtype) + self.assertEqual(result.shape, (0, 0)) + + if __name__ == '__main__': tests.main() From faf0a78b33402bda7885e59d00164a51e39da9e2 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2020 18:56:47 +0000 Subject: [PATCH 3/5] Code style fix. --- iris_grib/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/iris_grib/__init__.py b/iris_grib/__init__.py index 43e999c7..3794e894 100644 --- a/iris_grib/__init__.py +++ b/iris_grib/__init__.py @@ -92,7 +92,6 @@ def _array_slice_ifempty(keys, shape, dtype): return result - # NOTE: careful here, to avoid circular imports (as iris imports grib) from . import grib_phenom_translation as gptx from . 
import _save_rules From d13802912c32cfa8aaf706b3b7accd9d96e952a7 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 20 Feb 2020 12:07:29 +0000 Subject: [PATCH 4/5] Code style fix. --- iris_grib/tests/unit/message/test__DataProxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iris_grib/tests/unit/message/test__DataProxy.py b/iris_grib/tests/unit/message/test__DataProxy.py index 73cfeb35..d7494d5e 100644 --- a/iris_grib/tests/unit/message/test__DataProxy.py +++ b/iris_grib/tests/unit/message/test__DataProxy.py @@ -44,7 +44,7 @@ def test_bitmap__invalid_indicator(self): class Test_emptyfetch(tests.IrisGribTest): - # See : + # See : # iris.tests.unit.fileformats.pp.test_PPDataProxy.Test__getitem__slicing # In this case, test *only* the no-data-read effect, not the method which # is part of Iris. From e23229f4b7cd48a702e3a42a4cd69d7b8f6b1737 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 21 Feb 2020 10:36:19 +0000 Subject: [PATCH 5/5] Rely on iris.util for _array_slice_ifempty; require iris>=2.4 --- environment.yml | 2 +- iris_grib/__init__.py | 70 ++----------------------------------------- setup.py | 2 +- 3 files changed, 4 insertions(+), 70 deletions(-) diff --git a/environment.yml b/environment.yml index 361dacb2..cf1cb7b9 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ channels: - conda-forge dependencies: - - iris>=2 + - iris>=2.4 - python-eccodes>=0.9.1,<2 - pep8 diff --git a/iris_grib/__init__.py b/iris_grib/__init__.py index 3794e894..9e4a6e6b 100644 --- a/iris_grib/__init__.py +++ b/iris_grib/__init__.py @@ -20,79 +20,13 @@ import numpy as np import numpy.ma as ma +# NOTE: careful here, to avoid circular imports (as iris imports grib) import iris from iris._lazy_data import as_lazy_data import iris.coord_systems as coord_systems from iris.exceptions import TranslationError, NotYetImplementedError +from iris.util import _array_slice_ifempty -try: - from iris.util import _array_slice_ifempty -except 
ImportError: - # A temporary cut-and-paste hack, until this is in an Iris release ... - # see : https://github.com/SciTools/iris/pull/3659 - def _array_slice_ifempty(keys, shape, dtype): - """ - Detect cases where an array slice will contain no data, as it contains - a zero-length dimension, and produce an equivalent result for those - cases. - - The function indicates 'empty' slicing cases, by returning an array - equal to the slice result in those cases. - - Args: - - * keys (indexing key, or tuple of keys): - The argument from an array __getitem__ call. - Only tuples of integers and slices are supported, in particular no - newaxis, ellipsis or array keys. - These are the types of array access usage we expect from Dask. - * shape (tuple of int): - The shape of the array being indexed. - * dtype (numpy.dtype): - The dtype of the array being indexed. - - Returns: - result (np.ndarray or None): - If 'keys' contains a slice(0, 0), this is an ndarray of the - correct resulting shape and provided dtype. - Otherwise it is None. - - .. note:: - - This is used to prevent DataProxy arraylike objects from fetching - their file data when wrapped as Dask arrays. - This is because, for Dask >= 2.0, the "dask.array.from_array" call - performs a fetch like [0:0, 0:0, ...], to snapshot array metadata. - This function enables us to avoid triggering a file data fetch in - those cases : This is consistent because the result will not - contain any actual data content. - - """ - # Convert single key into a 1-tuple, so we always have a tuple of keys. - if isinstance(keys, tuple): - keys_tuple = keys - else: - keys_tuple = (keys,) - - if any(key == slice(0, 0) for key in keys_tuple): - # An 'empty' slice is present : Return a 'fake' array instead. - target_shape = list(shape) - for i_dim, key in enumerate(keys_tuple): - if key == slice(0, 0): - # Reduce dims with empty slicing to length 0. - target_shape[i_dim] = 0 - # Create a prototype result : no memory usage, as some dims are 0. 
- result = np.zeros(target_shape, dtype=dtype) - # Index with original keys to produce the desired result shape. - # Note : also ok in 0-length dims, as the slice is always '0:0'. - result = result[keys] - else: - result = None - - return result - - -# NOTE: careful here, to avoid circular imports (as iris imports grib) from . import grib_phenom_translation as gptx from . import _save_rules from ._load_convert import convert as load_convert diff --git a/setup.py b/setup.py index b36ed2a9..5db3255b 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ def file_walk_relative(top, remove=''): # NOTE: The Python 3 bindings to eccodes (eccodes-python) is available on # PyPI, but the user is required to install eccodes itself manually. See # ECMWF ecCodes installation documentation for more information. - install_requires=['scitools-iris>=2.0.*'] + ['eccodes-python'], + install_requires=['scitools-iris>=2.4'] + ['eccodes-python'], test_suite = 'iris_grib.tests', )