From dceb2983b7840b9b7e5c47fdbe9f13961461a9a2 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Fri, 9 Feb 2018 10:01:05 +0900 Subject: [PATCH 01/32] Start working --- xarray/core/indexing.py | 44 ++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e06b045ad88..cf02d7a7f81 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -485,13 +485,6 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): - # TODO should suport VectorizedIndexer - if isinstance(new_key, VectorizedIndexer): - raise NotImplementedError( - 'Vectorized indexing for {} is not implemented. Load your ' - 'data first with .load() or .compute(), or disable caching by ' - 'setting cache=False in open_dataset.'.format(type(self))) - iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim)) full_key = [] for size, k in zip(self.array.shape, self.key.tuple): @@ -520,6 +513,8 @@ def __array__(self, dtype=None): return np.asarray(array[self.key], dtype=None) def __getitem__(self, indexer): + if isinstance(indexer, VectorizedIndexer): + return type(self)(LazilyVectorizedIndexedArray(self, indexer)) return type(self)(self.array, self._updated_key(indexer)) def __setitem__(self, key, value): @@ -531,6 +526,41 @@ def __repr__(self): (type(self).__name__, self.array, self.key)) +class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin): + """Wrap an array to make vectorized indexing lazy. + """ + + def __init__(self, array, key): + """ + Parameters + ---------- + array : array_like + Array like object to index. 
+ key : VectorizedIndexer + """ + self.array = as_indexable(array) + self.key = key + raise NotImplementedError + # TODO compute shape from array.shape and key + self.shape = None + + def __array__(self, dtype=None): + array = as_indexable(self.array) + return np.asarray(array[self.key], dtype=None) + + def __getitem__(self, indexer): + return type(self)(self, indexer) + + def __setitem__(self, key, value): + raise NotImplementedError + full_key = self._updated_key(key) + self.array[full_key] = value + + def __repr__(self): + return ('%s(array=%r, key=%r)' % + (type(self).__name__, self.array, self.key)) + + def _wrap_numpy_scalars(array): """Wrap NumPy scalars in 0d arrays.""" if np.isscalar(array): From c1b4b60e7a15cc8f7513fc9b97e9f08501d930ea Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 9 Feb 2018 20:00:59 +0900 Subject: [PATCH 02/32] First support of lazy vectorized indexing. --- doc/whats-new.rst | 2 ++ xarray/core/indexing.py | 49 ++++++++++++++++++++++++++++++----- xarray/tests/test_backends.py | 34 +++++------------------- xarray/tests/test_indexing.py | 45 ++++++++++++++++++++++++++++++++ xarray/tests/test_variable.py | 6 ++--- 5 files changed, 98 insertions(+), 38 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f17bb8f0d49..f2172810364 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,6 +43,8 @@ Documentation Enhancements ~~~~~~~~~~~~ +- Lazy vectorized-indexing support. (:issue:`1897`) + By `Keisuke Fujii `_. - reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` arguments. (:issue:`1838`) By `Keisuke Fujii `_. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index cf02d7a7f81..6a4c0a16914 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -415,6 +415,22 @@ def __init__(self, key): super(VectorizedIndexer, self).__init__(new_key) + def infer_shape_of(self, shape): + """ Infer the shape of the array after indexing. 
""" + vindexes = [k for k in self._key if isinstance(k, np.ndarray)] + if len(vindexes) != 0: + vindex_shape = np.broadcast(*vindexes).shape + else: + vindex_shape = () + + def slice_size(sl, size): + ind = sl.indices(size) + return (ind[1] - ind[0] + ind[2] - 1) // ind[2] + + slice_shape = tuple([slice_size(k, s) for (k, s) in zip(self._key, shape) + if isinstance(k, slice)]) + return vindex_shape + slice_shape + class ExplicitlyIndexed(object): """Mixin to mark support for Indexer subclasses in indexing.""" @@ -508,6 +524,12 @@ def shape(self): shape.append(k.size) return tuple(shape) + def transpose(self, order): + if isinstance(self.array, LazilyVectorizedIndexedArray): + return type(self)(self.array.transpose(order)) + raise NotImplementedError( + 'transpose can only be used for vectorized-indexing.') + def __array__(self, dtype=None): array = as_indexable(self.array) return np.asarray(array[self.key], dtype=None) @@ -518,6 +540,10 @@ def __getitem__(self, indexer): return type(self)(self.array, self._updated_key(indexer)) def __setitem__(self, key, value): + if isinstance(key, VectorizedIndexer): + raise NotImplementedError( + 'Lazy item assignment with the vectorized indexer is not yet ' + 'implemented. 
Load your data first by .load() or compute().') full_key = self._updated_key(key) self.array[full_key] = value @@ -540,21 +566,30 @@ def __init__(self, array, key): """ self.array = as_indexable(array) self.key = key - raise NotImplementedError - # TODO compute shape from array.shape and key - self.shape = None + self._shape = key.infer_shape_of(self.array.shape) + self.order = None + + @property + def shape(self): + return self._shape + + def transpose(self, order): + if self.order is None: + self.order = order + else: + self.order = np.array(self.order)[order] + return self def __array__(self, dtype=None): array = as_indexable(self.array) - return np.asarray(array[self.key], dtype=None) + return np.asarray(array, dtype=None)[self.key].transpose(*self.order) def __getitem__(self, indexer): - return type(self)(self, indexer) + array = as_indexable(self.array) + return np.asarray(array, dtype=None)[self.key.tuple][indexer.tuple] def __setitem__(self, key, value): raise NotImplementedError - full_key = self._updated_key(key) - self.array[full_key] = value def __repr__(self): return ('%s(array=%r, key=%r)' % diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 85b6bdea346..c4b5919e341 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -411,7 +411,7 @@ def test_orthogonal_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) - def _test_vectorized_indexing(self, vindex_support=True): + def test_vectorized_indexing(self): # Make sure vectorized_indexing works or at least raises # NotImplementedError in_memory = create_test_data() @@ -419,20 +419,12 @@ def _test_vectorized_indexing(self, vindex_support=True): indexers = {'dim1': DataArray([0, 2, 0], dims='a'), 'dim2': DataArray([0, 2, 3], dims='a')} expected = in_memory.isel(**indexers) - if vindex_support: - actual = on_disk.isel(**indexers) - assert_identical(expected, actual) - # do it twice, to make sure we're switched from - # 
orthogonal -> numpy when we cached the values - actual = on_disk.isel(**indexers) - assert_identical(expected, actual) - else: - with raises_regex(NotImplementedError, 'Vectorized indexing '): - actual = on_disk.isel(**indexers) - - def test_vectorized_indexing(self): - # This test should be overwritten if vindex is supported - self._test_vectorized_indexing(vindex_support=False) + actual = on_disk.isel(**indexers) + assert_identical(expected, actual) + # do it twice, to make sure we're switched from + # orthogonal -> numpy when we cached the values + actual = on_disk.isel(**indexers) + assert_identical(expected, actual) def test_isel_dataarray(self): # Make sure isel works lazily. GH:issue:1688 @@ -738,9 +730,6 @@ def test_append_with_invalid_dim_raises(self): 'Unable to update size for existing dimension'): self.save(data, tmp_file, mode='a') - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=False) - def test_multiindex_not_implemented(self): ds = (Dataset(coords={'y': ('x', [1, 2]), 'z': ('x', ['a', 'b'])}) .set_index(x=['y', 'z'])) @@ -1101,9 +1090,6 @@ def test_dataset_caching(self): # caching behavior differs for dask pass - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True @@ -1220,9 +1206,6 @@ def test_chunk_encoding_with_dask(self): with self.roundtrip(ds_chunk4) as actual: pass - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - def test_hidden_zarr_keys(self): expected = create_test_data() with self.create_store() as store: @@ -2031,9 +2014,6 @@ def test_dataarray_compute(self): self.assertTrue(computed._in_memory) assert_allclose(actual, computed, decode_bytes=False) - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - class DaskTestAutocloseTrue(DaskTest): autoclose = True diff --git 
a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 3d93afb26d4..3863122379f 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -196,6 +196,39 @@ def test_lazily_indexed_array(self): assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter) + def test_vectorized_lazily_indexed_array(self): + original = np.random.rand(10, 20, 30) + x = indexing.NumpyIndexingAdapter(original) + lazy = indexing.LazilyIndexedArray(x) + v_lazy = Variable(['i', 'j', 'k'], lazy) + I = ReturnItem() # noqa: E741 # allow ambiguous name + + def check_indexing(va, indexers): + for indexer in indexers: + actual = va[indexer] + indexer = tuple(getattr(ind, 'data', ind) for ind in indexer) + expected = np.asarray(va)[indexer] + assert expected.shape == actual.shape + assert isinstance(actual._data, indexing.LazilyIndexedArray) + assert_array_equal(expected, actual) + va = actual + + # test orthogonal indexing + indexers = [(I[:], 0, 1), (Variable('i', [0, 1]), )] + check_indexing(v_lazy, indexers) + + # vectorized indexing + indexers = [ + (Variable('i', [0, 1]), Variable('i', [0, 1]), slice(None)), + (slice(1, 3, 2), 0)] + check_indexing(v_lazy, indexers) + + indexers = [ + (slice(None, None, 2), 0, slice(None, 10)), + (Variable('i', [3, 2, 4]), Variable('i', [3, 2, 1])), + (slice(0, -1, 2), )] + check_indexing(v_lazy, indexers) + class TestCopyOnWriteArray(TestCase): def test_setitem(self): @@ -334,6 +367,18 @@ def test_vectorized_indexer(): np.arange(5, dtype=np.int64))) +def test_vectorized_indexer_infer_shape_of(): + data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13)) + indexers = [np.array([[0, 3, 2], ]), np.array([[0, 3, 3], [4, 6, 7]]), + slice(2, -2, 2), slice(2, -2, 3), slice(None)] + + for i, j, k in itertools.product(indexers, repeat=3): + vindex = indexing.VectorizedIndexer((i, j, k)) + expected = data[vindex].shape + actual = vindex.infer_shape_of(data.shape) + assert expected == actual + + def 
test_unwrap_explicit_indexer(): indexer = indexing.BasicIndexer((1, 2)) target = None diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5a89627a0f9..802b1be3203 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1893,8 +1893,7 @@ def test_NumpyIndexingAdapter(self): def test_LazilyIndexedArray(self): v = Variable(dims=('x', 'y'), data=LazilyIndexedArray(self.d)) self.check_orthogonal_indexing(v) - with raises_regex(NotImplementedError, 'Vectorized indexing for '): - self.check_vectorized_indexing(v) + self.check_vectorized_indexing(v) # doubly wrapping v = Variable(dims=('x', 'y'), data=LazilyIndexedArray(LazilyIndexedArray(self.d))) @@ -1912,8 +1911,7 @@ def test_CopyOnWriteArray(self): v = Variable(dims=('x', 'y'), data=CopyOnWriteArray(LazilyIndexedArray(self.d))) self.check_orthogonal_indexing(v) - with raises_regex(NotImplementedError, 'Vectorized indexing for '): - self.check_vectorized_indexing(v) + self.check_vectorized_indexing(v) def test_MemoryCachedArray(self): v = Variable(dims=('x', 'y'), data=MemoryCachedArray(self.d)) From d989a150d243da3715318dd09db3c77834ed99d7 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Fri, 9 Feb 2018 23:01:08 +0900 Subject: [PATCH 03/32] Some optimization. 
--- xarray/core/indexing.py | 44 +++++++++++++++++++++++------------ xarray/tests/test_indexing.py | 6 ++++- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6a4c0a16914..7a4fa0b3c11 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -427,10 +427,27 @@ def slice_size(sl, size): ind = sl.indices(size) return (ind[1] - ind[0] + ind[2] - 1) // ind[2] - slice_shape = tuple([slice_size(k, s) for (k, s) in zip(self._key, shape) - if isinstance(k, slice)]) + slice_shape = tuple([slice_size(k, s) for (k, s) in + zip(self._key, shape) if isinstance(k, slice)]) return vindex_shape + slice_shape + def decompose(self, shape): + """ Decompose vectorized indexer to outer and vectorized indexers, + array[self] == array[oindex][vindex] + such that array[oindex].shape becomes smallest. + """ + oindex = [] + vindex = [] + for k, s in zip(self._key, shape): + if isinstance(k, slice): + oindex.append(k) + vindex.append(slice(None)) + else: # np.ndarray + k = np.where(k < 0, s - k, k) + oindex.append(slice(np.min(k), np.max(k) + 1)) + vindex.append(k - np.min(k)) + return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) + class ExplicitlyIndexed(object): """Mixin to mark support for Indexer subclasses in indexing.""" @@ -527,8 +544,7 @@ def shape(self): def transpose(self, order): if isinstance(self.array, LazilyVectorizedIndexedArray): return type(self)(self.array.transpose(order)) - raise NotImplementedError( - 'transpose can only be used for vectorized-indexing.') + raise AttributeError def __array__(self, dtype=None): array = as_indexable(self.array) @@ -564,29 +580,27 @@ def __init__(self, array, key): Array like object to index. 
key : VectorizedIndexer """ - self.array = as_indexable(array) - self.key = key - self._shape = key.infer_shape_of(self.array.shape) - self.order = None + oindex, vindex = key.decompose(array.shape) + self.array = as_indexable(array)[oindex] + self.key = vindex + self._shape = self.key.infer_shape_of(self.array.shape) + self.order = np.arange(self.ndim) @property def shape(self): return self._shape def transpose(self, order): - if self.order is None: - self.order = order - else: - self.order = np.array(self.order)[order] + self.order = np.array(self.order)[order] return self def __array__(self, dtype=None): array = as_indexable(self.array) - return np.asarray(array, dtype=None)[self.key].transpose(*self.order) + return np.asarray(array, dtype=None)[self.key.tuple].transpose(*self.order) def __getitem__(self, indexer): - array = as_indexable(self.array) - return np.asarray(array, dtype=None)[self.key.tuple][indexer.tuple] + # note this is not lazy + return np.asarray(self)[indexer.tuple] def __setitem__(self, key, value): raise NotImplementedError diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 3863122379f..ff7e70d9f96 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -367,7 +367,7 @@ def test_vectorized_indexer(): np.arange(5, dtype=np.int64))) -def test_vectorized_indexer_infer_shape_of(): +def test_vectorized_indexer_utils(): data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13)) indexers = [np.array([[0, 3, 2], ]), np.array([[0, 3, 3], [4, 6, 7]]), slice(2, -2, 2), slice(2, -2, 3), slice(None)] @@ -378,6 +378,10 @@ def test_vectorized_indexer_infer_shape_of(): actual = vindex.infer_shape_of(data.shape) assert expected == actual + oind, vind = vindex.decompose(data.shape) + np.testing.assert_array_equal( + data[vindex], indexing.NumpyIndexingAdapter(data[oind])[vind]) + def test_unwrap_explicit_indexer(): indexer = indexing.BasicIndexer((1, 2)) From 
218763c2176277c5c5ae4c2e7f7a64b5c42620f6 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 10 Feb 2018 13:42:41 +0900 Subject: [PATCH 04/32] Use unique to decompose vectorized indexing. --- xarray/core/indexing.py | 12 ++++++------ xarray/tests/test_indexing.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7a4fa0b3c11..e308a69113a 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -431,21 +431,21 @@ def slice_size(sl, size): zip(self._key, shape) if isinstance(k, slice)]) return vindex_shape + slice_shape - def decompose(self, shape): + def decompose(self): """ Decompose vectorized indexer to outer and vectorized indexers, array[self] == array[oindex][vindex] such that array[oindex].shape becomes smallest. """ oindex = [] vindex = [] - for k, s in zip(self._key, shape): + for k in self._key: if isinstance(k, slice): oindex.append(k) vindex.append(slice(None)) else: # np.ndarray - k = np.where(k < 0, s - k, k) - oindex.append(slice(np.min(k), np.max(k) + 1)) - vindex.append(k - np.min(k)) + oind, vind = np.unique(k, return_inverse=True) + oindex.append(oind) + vindex.append(vind.reshape(*k.shape)) return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) @@ -580,7 +580,7 @@ def __init__(self, array, key): Array like object to index. 
key : VectorizedIndexer """ - oindex, vindex = key.decompose(array.shape) + oindex, vindex = key.decompose() self.array = as_indexable(array)[oindex] self.key = vindex self._shape = self.key.infer_shape_of(self.array.shape) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index ff7e70d9f96..2a5c9ce1e31 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -378,7 +378,7 @@ def test_vectorized_indexer_utils(): actual = vindex.infer_shape_of(data.shape) assert expected == actual - oind, vind = vindex.decompose(data.shape) + oind, vind = vindex.decompose() np.testing.assert_array_equal( data[vindex], indexing.NumpyIndexingAdapter(data[oind])[vind]) From 541fca3f78faee69cd8bf3ca33548e2b457a93fa Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 10 Feb 2018 17:31:03 +0900 Subject: [PATCH 05/32] Consolidate vectorizedIndexing --- xarray/core/indexing.py | 126 +++++++++++++++++++--------------- xarray/tests/test_backends.py | 6 +- xarray/tests/test_indexing.py | 61 +++++++++------- 3 files changed, 109 insertions(+), 84 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e308a69113a..9973e8c5fc8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -415,39 +415,6 @@ def __init__(self, key): super(VectorizedIndexer, self).__init__(new_key) - def infer_shape_of(self, shape): - """ Infer the shape of the array after indexing. 
""" - vindexes = [k for k in self._key if isinstance(k, np.ndarray)] - if len(vindexes) != 0: - vindex_shape = np.broadcast(*vindexes).shape - else: - vindex_shape = () - - def slice_size(sl, size): - ind = sl.indices(size) - return (ind[1] - ind[0] + ind[2] - 1) // ind[2] - - slice_shape = tuple([slice_size(k, s) for (k, s) in - zip(self._key, shape) if isinstance(k, slice)]) - return vindex_shape + slice_shape - - def decompose(self): - """ Decompose vectorized indexer to outer and vectorized indexers, - array[self] == array[oindex][vindex] - such that array[oindex].shape becomes smallest. - """ - oindex = [] - vindex = [] - for k in self._key: - if isinstance(k, slice): - oindex.append(k) - vindex.append(slice(None)) - else: # np.ndarray - oind, vind = np.unique(k, return_inverse=True) - oindex.append(oind) - vindex.append(vind.reshape(*k.shape)) - return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) - class ExplicitlyIndexed(object): """Mixin to mark support for Indexer subclasses in indexing.""" @@ -541,18 +508,14 @@ def shape(self): shape.append(k.size) return tuple(shape) - def transpose(self, order): - if isinstance(self.array, LazilyVectorizedIndexedArray): - return type(self)(self.array.transpose(order)) - raise AttributeError - def __array__(self, dtype=None): array = as_indexable(self.array) return np.asarray(array[self.key], dtype=None) def __getitem__(self, indexer): if isinstance(indexer, VectorizedIndexer): - return type(self)(LazilyVectorizedIndexedArray(self, indexer)) + array = LazilyVectorizedIndexedArray(self.array, self.key) + return array[indexer] return type(self)(self.array, self._updated_key(indexer)) def __setitem__(self, key, value): @@ -580,27 +543,39 @@ def __init__(self, array, key): Array like object to index. 
key : VectorizedIndexer """ - oindex, vindex = key.decompose() - self.array = as_indexable(array)[oindex] - self.key = vindex - self._shape = self.key.infer_shape_of(self.array.shape) + if isinstance(key, (BasicIndexer, OuterIndexer)): + self.key = VectorizedIndexer( + _outer_to_vectorized_indexer(key.tuple, array.shape)) + else: + self.key = _arrayize_vectorized_indexer(key, array.shape) + self.array = as_indexable(array) self.order = np.arange(self.ndim) @property def shape(self): - return self._shape - - def transpose(self, order): - self.order = np.array(self.order)[order] - return self + return np.broadcast(*self.key.tuple).shape def __array__(self, dtype=None): - array = as_indexable(self.array) - return np.asarray(array, dtype=None)[self.key.tuple].transpose(*self.order) + try: + array = np.asarray(self.array[self.key], dtype=None) + except NotImplementedError: + # if vectorized indexing is not supported + oind, vind = _decompose_vectorized_indexer(self.key) + array = NumpyIndexingAdapter(np.asarray(self.array[oind], + dtype=None))[vind] + return array + + def _updated_key(self, new_key): + return VectorizedIndexer( + tuple(o[new_key.tuple] for o in np.broadcast_arrays(*self.key.tuple))) def __getitem__(self, indexer): - # note this is not lazy - return np.asarray(self)[indexer.tuple] + return type(self)(self.array, self._updated_key(indexer)) + + def transpose(self, order): + key = VectorizedIndexer(tuple( + k.transpose(order) for k in self.key.tuple)) + return type(self)(self.array, key) def __setitem__(self, key, value): raise NotImplementedError @@ -681,7 +656,7 @@ def _outer_to_vectorized_indexer(key, shape): Parameters ---------- key : tuple - Tuple from an OuterIndexer to convert. + Tuple from an Basic/OuterIndexer to convert. shape : tuple Shape of the array subject to the indexing. @@ -689,15 +664,15 @@ def _outer_to_vectorized_indexer(key, shape): ------- tuple Tuple suitable for use to index a NumPy array with vectorized indexing. 
- Each element is an integer or array: broadcasting them together gives - the shape of the result. + Each element is array: broadcasting them together gives the shape of + the result. """ n_dim = len([k for k in key if not isinstance(k, integer_types)]) i_dim = 0 new_key = [] for k, size in zip(key, shape): if isinstance(k, integer_types): - new_key.append(k) + new_key.append(np.array(k).reshape((1,) * n_dim)) else: # np.ndarray or slice if isinstance(k, slice): k = np.arange(*k.indices(size)) @@ -733,6 +708,45 @@ def _outer_to_numpy_indexer(key, shape): return _outer_to_vectorized_indexer(key, shape) +def _decompose_vectorized_indexer(indexer): + """ Decompose vectorized indexer to outer and vectorized indexers, + array[indexer] == array[oindex][vindex] + such that array[oindex].shape becomes smallest. + """ + oindex = [] + vindex = [] + for k in indexer.tuple: + if isinstance(k, slice): + oindex.append(k) + vindex.append(slice(None)) + else: # np.ndarray + oind, vind = np.unique(k, return_inverse=True) + oindex.append(oind) + vindex.append(vind.reshape(*k.shape)) + return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) + + +def _arrayize_vectorized_indexer(indexer, shape): + """ Return an identical vindex but slices are replaced by arrays """ + slices = [v for v in indexer.tuple if isinstance(v, slice)] + if len(slices) == 0: + return indexer + + arrays = [v for v in indexer.tuple if isinstance(v, np.ndarray)] + n_dim = arrays[0].ndim if len(arrays) > 0 else 0 + i_dim = 0 + new_key = [] + for v, size in zip(indexer.tuple, shape): + if isinstance(v, np.ndarray): + new_key.append(np.reshape(v, v.shape + (1, ) * len(slices))) + else: # slice + shape = ((1,) * (n_dim + i_dim) + (-1,) + + (1,) * (len(slices) - i_dim - 1)) + new_key.append(np.arange(*v.indices(size)).reshape(shape)) + i_dim += 1 + return VectorizedIndexer(tuple(new_key)) + + def _dask_array_with_chunks_hint(array, chunks): """Create a dask array using the chunks hint for dimensions of 
size > 1.""" import dask.array as da diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c4b5919e341..f7233658fb8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -412,15 +412,15 @@ def test_orthogonal_indexing(self): assert_identical(expected, actual) def test_vectorized_indexing(self): - # Make sure vectorized_indexing works or at least raises - # NotImplementedError in_memory = create_test_data() with self.roundtrip(in_memory) as on_disk: indexers = {'dim1': DataArray([0, 2, 0], dims='a'), 'dim2': DataArray([0, 2, 3], dims='a')} expected = in_memory.isel(**indexers) actual = on_disk.isel(**indexers) - assert_identical(expected, actual) + # make sure the array is not yet loaded + assert not actual['var1'].variable._in_memory + assert_identical(expected, actual.load()) # do it twice, to make sure we're switched from # orthogonal -> numpy when we cached the values actual = on_disk.isel(**indexers) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 2a5c9ce1e31..86ac8acbcce 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -199,35 +199,38 @@ def test_lazily_indexed_array(self): def test_vectorized_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) + v_eager = Variable(['i', 'j', 'k'], x) lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # noqa: E741 # allow ambiguous name - def check_indexing(va, indexers): + def check_indexing(v_eager, v_lazy, indexers): for indexer in indexers: - actual = va[indexer] - indexer = tuple(getattr(ind, 'data', ind) for ind in indexer) - expected = np.asarray(va)[indexer] + actual = v_lazy[indexer] + expected = v_eager[indexer] assert expected.shape == actual.shape - assert isinstance(actual._data, indexing.LazilyIndexedArray) + assert isinstance(actual._data, + (indexing.LazilyVectorizedIndexedArray, + 
indexing.LazilyIndexedArray)) assert_array_equal(expected, actual) - va = actual + v_eager = expected + v_lazy = actual # test orthogonal indexing indexers = [(I[:], 0, 1), (Variable('i', [0, 1]), )] - check_indexing(v_lazy, indexers) + check_indexing(v_eager, v_lazy, indexers) # vectorized indexing indexers = [ (Variable('i', [0, 1]), Variable('i', [0, 1]), slice(None)), (slice(1, 3, 2), 0)] - check_indexing(v_lazy, indexers) + check_indexing(v_eager, v_lazy, indexers) indexers = [ (slice(None, None, 2), 0, slice(None, 10)), - (Variable('i', [3, 2, 4]), Variable('i', [3, 2, 1])), - (slice(0, -1, 2), )] - check_indexing(v_lazy, indexers) + (Variable('i', [3, 2, 4, 3]), Variable('i', [3, 2, 1, 0])), + (Variable(['i', 'j'], [[0, 1], [1, 2]]), )] + check_indexing(v_eager, v_lazy, indexers) class TestCopyOnWriteArray(TestCase): @@ -367,20 +370,28 @@ def test_vectorized_indexer(): np.arange(5, dtype=np.int64))) -def test_vectorized_indexer_utils(): - data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13)) - indexers = [np.array([[0, 3, 2], ]), np.array([[0, 3, 3], [4, 6, 7]]), - slice(2, -2, 2), slice(2, -2, 3), slice(None)] - - for i, j, k in itertools.product(indexers, repeat=3): - vindex = indexing.VectorizedIndexer((i, j, k)) - expected = data[vindex].shape - actual = vindex.infer_shape_of(data.shape) - assert expected == actual - - oind, vind = vindex.decompose() - np.testing.assert_array_equal( - data[vindex], indexing.NumpyIndexingAdapter(data[oind])[vind]) +class Test_vectorized_indexer(TestCase): + def setUp(self): + self.data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13)) + self.indexers = [np.array([[0, 3, 2], ]), + np.array([[0, 3, 3], [4, 6, 7]]), + slice(2, -2, 2), slice(2, -2, 3), slice(None)] + + def test_decompose_vectorized_indexer(self): + for i, j, k in itertools.product(self.indexers, repeat=3): + vindex = indexing.VectorizedIndexer((i, j, k)) + oind, vind = indexing._decompose_vectorized_indexer(vindex) + 
np.testing.assert_array_equal( + self.data[vindex], + indexing.NumpyIndexingAdapter(self.data[oind])[vind]) + + def test_arrayize_vectorized_indexer(self): + for i, j, k in itertools.product(self.indexers, repeat=3): + vindex = indexing.VectorizedIndexer((i, j, k)) + vindex_array = indexing._arrayize_vectorized_indexer( + vindex, self.data.shape) + np.testing.assert_array_equal( + self.data[vindex], self.data[vindex_array],) def test_unwrap_explicit_indexer(): From 3e05a165833c9ce1e0519535affd36e1145bd514 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 11 Feb 2018 23:21:49 +0900 Subject: [PATCH 06/32] Support vectorized_indexing in h5py --- xarray/backends/h5netcdf_.py | 34 ++++++++++++++++++++++++++++------ xarray/core/indexing.py | 4 ++-- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index cba1d33115f..6439bc1fc52 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -19,13 +19,35 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): key = indexing.unwrap_explicit_indexer( key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) - # h5py requires using lists for fancy indexing: - # https://github.com/h5py/h5py/issues/992 - # OuterIndexer only holds 1D integer ndarrays, so it's safe to convert - # them to lists. - key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key) + + # Convert array-indexers to slice except most effective one. 
+ key = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k + for k, s in zip(key, self.shape)] + gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) + if isinstance(k, np.ndarray) else 0 for k in key] + array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None + proper_key, eager_key = [], [] + for i, k in enumerate(key): + if isinstance(k, np.ndarray) and i != array_index: + proper_key.append(slice(np.min(k), np.max(k) + 1)) + eager_key.append(k - np.min(k)) + elif isinstance(k, np.ndarray): + # h5py requires increasing, non-duplicated key + pkey, ekey = np.unique(k, return_inverse=True) + # h5py requires using lists for fancy indexing: + # https://github.com/h5py/h5py/issues/992 + proper_key.append(list(pkey)) + eager_key.append(ekey) + else: + proper_key.append(k) + eager_key.append(slice(None)) + with self.datastore.ensure_open(autoclose=True): - return self.get_array()[key] + array = np.asarray(self.get_array()[tuple(proper_key)], + dtype=self.dtype) + + key = indexing._outer_to_numpy_indexer(tuple(eager_key), self.shape) + return array[key] def maybe_decode_bytes(txt): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 9973e8c5fc8..0e30fa53c30 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -566,8 +566,8 @@ def __array__(self, dtype=None): return array def _updated_key(self, new_key): - return VectorizedIndexer( - tuple(o[new_key.tuple] for o in np.broadcast_arrays(*self.key.tuple))) + return VectorizedIndexer(tuple(o[new_key.tuple] for o in + np.broadcast_arrays(*self.key.tuple))) def __getitem__(self, indexer): return type(self)(self.array, self._updated_key(indexer)) From 030a2c43064090c289e806ef6f4cd96cfbd1125e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 13 Feb 2018 22:54:04 +0900 Subject: [PATCH 07/32] Refactoring backend array. Added indexing.decompose_indexers. Drop unwrap_explicit_indexers. 
--- xarray/backends/h5netcdf_.py | 36 ++----- xarray/backends/netCDF4_.py | 12 ++- xarray/backends/pydap_.py | 12 ++- xarray/backends/pynio_.py | 10 +- xarray/backends/rasterio_.py | 6 +- xarray/core/indexing.py | 194 +++++++++++++++++++++++++++------- xarray/tests/test_indexing.py | 61 +++++++---- 7 files changed, 229 insertions(+), 102 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 6439bc1fc52..bcde99a8a65 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -17,37 +17,19 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) - - # Convert array-indexers to slice except most effective one. - key = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k - for k, s in zip(key, self.shape)] - gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) - if isinstance(k, np.ndarray) else 0 for k in key] - array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None - proper_key, eager_key = [], [] - for i, k in enumerate(key): - if isinstance(k, np.ndarray) and i != array_index: - proper_key.append(slice(np.min(k), np.max(k) + 1)) - eager_key.append(k - np.min(k)) - elif isinstance(k, np.ndarray): - # h5py requires increasing, non-duplicated key - pkey, ekey = np.unique(k, return_inverse=True) - # h5py requires using lists for fancy indexing: - # https://github.com/h5py/h5py/issues/992 - proper_key.append(list(pkey)) - eager_key.append(ekey) - else: - proper_key.append(k) - eager_key.append(slice(None)) + key, np_inds = indexing.decompose_indexer(key, self.shape, + mode='outer_1vector') + # h5py requires using lists for fancy indexing: + # https://github.com/h5py/h5py/issues/992 + key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key) with self.datastore.ensure_open(autoclose=True): - array = np.asarray(self.get_array()[tuple(proper_key)], 
+ array = np.asarray(self.get_array()[key.tuple], dtype=self.dtype) + for ind in np_inds: + array = indexing.NumpyIndexingAdapter(array)[ind] - key = indexing._outer_to_numpy_indexer(tuple(eager_key), self.shape) - return array[key] + return array def maybe_decode_bytes(txt): diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index b3cb1d8e49f..0aed2a16ce6 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -50,9 +50,8 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) - + key, np_inds = indexing.decompose_indexer(key, self.shape, + mode='outer') if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) else: @@ -60,7 +59,7 @@ def __getitem__(self, key): with self.datastore.ensure_open(autoclose=True): try: - data = getitem(self.get_array(), key) + array = getitem(self.get_array(), key.tuple) except IndexError: # Catch IndexError in netCDF4 and return a more informative # error message. 
This is most often called when an unsorted @@ -73,7 +72,10 @@ def __getitem__(self, key): msg += '\n\nOriginal traceback:\n' + traceback.format_exc() raise IndexError(msg) - return data + for ind in np_inds: + array = indexing.NumpyIndexingAdapter(array)[ind] + + return array def _encode_nc4_variable(var): diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 297d96e47f4..571160d915a 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -24,18 +24,22 @@ def dtype(self): return self.array.dtype def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, target=self, allow=indexing.BasicIndexer) + key, np_inds = indexing.decompose_indexer(key, self.shape, + mode='basic') # pull the data from the array attribute if possible, to avoid # downloading coordinate data twice array = getattr(self.array, 'array', self.array) - result = robust_getitem(array, key, catch=ValueError) + result = robust_getitem(array, key.tuple, catch=ValueError) # pydap doesn't squeeze axes automatically like numpy - axis = tuple(n for n, k in enumerate(key) + axis = tuple(n for n, k in enumerate(key.tuple) if isinstance(k, integer_types)) if len(axis) > 0: result = np.squeeze(result, axis) + + for ind in np_inds: + result = indexing.NumpyIndexingAdapter(result)[ind] + return result diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 37f1db1f6a7..c7e0ddf10dd 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -27,14 +27,18 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name] def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, target=self, allow=indexing.BasicIndexer) + key, np_inds = indexing.decompose_indexer(key, self.shape, + mode='outer') with self.datastore.ensure_open(autoclose=True): array = self.get_array() if key == () and self.ndim == 0: return array.get_value() - return array[key] + + for ind in np_inds: + array = 
indexing.NumpyIndexingAdapter(array)[ind] + + return array class NioDataStore(AbstractDataStore, DataStorePickleMixin): diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index c624c1f5ff8..4534edfb388 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -41,8 +41,10 @@ def shape(self): return self._shape def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) + # TODO support indexing.decompose_indexer + if isinstance(key, indexing.VectorizedIndexer): + raise NotImplementedError + key = key.tuple # bands cannot be windowed but they can be listed band_key = key[0] diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 0e30fa53c30..d92dc85d8c4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -427,23 +427,6 @@ def __array__(self, dtype=None): return np.asarray(self[key], dtype=dtype) -def unwrap_explicit_indexer(key, target, allow): - """Unwrap an explicit key into a tuple.""" - if not isinstance(key, ExplicitIndexer): - raise TypeError('unexpected key type: {}'.format(key)) - if not isinstance(key, allow): - key_type_name = { - BasicIndexer: 'Basic', - OuterIndexer: 'Outer', - VectorizedIndexer: 'Vectorized' - }[type(key)] - raise NotImplementedError( - '{} indexing for {} is not implemented. 
Load your data first with ' - '.load(), .compute() or .persist(), or disable caching by setting ' - 'cache=False in open_dataset.'.format(key_type_name, type(target))) - return key.tuple - - class ImplicitToExplicitIndexingAdapter(utils.NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" @@ -556,14 +539,7 @@ def shape(self): return np.broadcast(*self.key.tuple).shape def __array__(self, dtype=None): - try: - array = np.asarray(self.array[self.key], dtype=None) - except NotImplementedError: - # if vectorized indexing is not supported - oind, vind = _decompose_vectorized_indexer(self.key) - array = NumpyIndexingAdapter(np.asarray(self.array[oind], - dtype=None))[vind] - return array + return np.asarray(self.array[self.key], dtype=None) def _updated_key(self, new_key): return VectorizedIndexer(tuple(o[new_key.tuple] for o in @@ -578,7 +554,9 @@ def transpose(self, order): return type(self)(self.array, key) def __setitem__(self, key, value): - raise NotImplementedError + raise NotImplementedError( + 'Lazy item assignment with the vectorized indexer is not yet ' + 'implemented. Load your data first by .load() or compute().') def __repr__(self): return ('%s(array=%r, key=%r)' % @@ -708,22 +686,162 @@ def _outer_to_numpy_indexer(key, shape): return _outer_to_vectorized_indexer(key, shape) -def _decompose_vectorized_indexer(indexer): - """ Decompose vectorized indexer to outer and vectorized indexers, - array[indexer] == array[oindex][vindex] - such that array[oindex].shape becomes smallest. 
+def decompose_indexer(indexer, shape, mode): + if isinstance(indexer, VectorizedIndexer): + return _decompose_vectorized_indexer(indexer, shape, mode) + if isinstance(indexer, OuterIndexer): + return _decompose_outer_indexer(indexer, shape, mode) + if isinstance(indexer, BasicIndexer): + return indexer, () + raise TypeError('unexpected key type: {}'.format(indexer)) + + +def _decompose_vectorized_indexer(indexer, shape, mode): + """ + Decompose vectorized indexer to the successive two indexers, where the + first indexer will be used to index backend arrays, while the second one + is used to index loaded on-memory np.ndarray. + + Parameters + ---------- + indexer: VectorizedIndexer + mode: str, one of ['basic' | 'outer' | 'outer_1vector' | 'vectorized'] + basic: for backends that support onle basic indexer + outer: for backends that support basic / outer indexer + outer_1vector: for backends that support outer indexer inluding at most + 1 vector. + vectorized: for backends that support full vectorized indexer. + + Returns + ------- + backend_indexer: OuterIndexer or BasicIndexer + np_indexers: a sequence of vectorizedIndexer + + Note + ---- + This function is used to realize the vectorized indexing for the backend + arrays that only support basic or outer indexing. + + As an example, let us consider to index a few elements from a backend array + with a vectorized indexer ([0, 3, 1], [2, 3, 2]). + Even if the backend array only supports outer indexing, it is more + efficient to load a subslice of the array than loading the entire array, + + >>> backend_indexer = OuterIndexer([0, 1, 3], [2, 3]) + >>> array = array[backend_indexer] # load subslice of the array + >>> np_indexer = VectorizedIndexer([0, 2, 1], [0, 1, 0]) + >>> array[np_indexer] # vectorized indexing for on-memory np.ndarray. 
""" - oindex = [] - vindex = [] - for k in indexer.tuple: + assert isinstance(indexer, VectorizedIndexer) + + if mode == 'vectorized': + return indexer, () + + backend_indexer = [] + np_indexer = [] + # posify + indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k + for k, s in zip(indexer.tuple, shape)] + + for k in indexer: if isinstance(k, slice): - oindex.append(k) - vindex.append(slice(None)) + backend_indexer.append(k) + np_indexer.append(slice(None)) else: # np.ndarray oind, vind = np.unique(k, return_inverse=True) - oindex.append(oind) - vindex.append(vind.reshape(*k.shape)) - return OuterIndexer(tuple(oindex)), VectorizedIndexer(tuple(vindex)) + backend_indexer.append(oind) + np_indexer.append(vind.reshape(*k.shape)) + + backend_indexer = OuterIndexer(tuple(backend_indexer)) + np_indexer = VectorizedIndexer(tuple(np_indexer)) + + if mode == 'outer': + return backend_indexer, (np_indexer, ) + + shape = tuple(s if isinstance(k, slice) else len(k) for k, s in + zip(backend_indexer.tuple, shape)) + backend_indexer, np_indexers = _decompose_outer_indexer( + backend_indexer, shape, mode) + return backend_indexer, (np_indexers[0], np_indexer) + + +def _decompose_outer_indexer(indexer, shape, mode): + """ + Decompose ouer indexer to the successive two indexers, where the + first indexer will be used to index backend arrays, while the second one + is ised to index the loaded on-memory np.ndarray. + Note + ---- + This function is used to realize the vectorized indexing for the backend + arrays that only support basic or outer indexing. + + As an example, let us consider to index a few elements from a backend array + with a orthogonal indexer ([0, 3, 1], [2, 3, 2]). 
+ Even if the backend array only supports basic indexing, it is more + efficient to load a subslice of the array than loading the entire array, + + >>> backend_indexer = BasicIndexer(slice(0, 3), slice(2, 3)) + >>> array = array[backend_indexer] # load subslice of the array + >>> np_indexer = OuterIndexer([0, 2, 1], [0, 1, 0]) + >>> array[np_indexer] # outer indexing for on-memory np.ndarray. + """ + if mode in ['outer', 'vectorized']: + return indexer, () + assert isinstance(indexer, OuterIndexer) + + backend_indexer = [] + np_indexer = [] + # posify + indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k + for k, s in zip(indexer.tuple, shape)] + + if mode == 'outer_1vector': + # some backends such as h5py supports only 1 vector in indexers + # We choose the most efficient axis + gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) + if isinstance(k, np.ndarray) else 0 for k in indexer] + array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None + + for i, k in enumerate(indexer): + if isinstance(k, np.ndarray) and i != array_index: + backend_indexer.append(slice(np.min(k), np.max(k) + 1)) + np_indexer.append(k - np.min(k)) + elif isinstance(k, np.ndarray): + pkey, ekey = np.unique(k, return_inverse=True) + backend_indexer.append(pkey) + np_indexer.append(ekey) + else: + backend_indexer.append(k) + np_indexer.append(slice(None)) + + return (OuterIndexer(tuple(backend_indexer)), + (OuterIndexer(tuple(np_indexer)), )) + + if mode == 'outer_sorted': + for k in indexer: + if (isinstance(k, integer_types + (slice, )) or + (np.diff(k) >= 0).all()): + backend_indexer.append(k) + np_indexer.append(slice(None)) + else: # not sorted np.ndarray indexer + oind, vind = np.unique(k, return_inverse=True) + backend_indexer.append(oind) + np_indexer.append(vind.reshape(*k.shape)) + + return (OuterIndexer(tuple(backend_indexer)), + (OuterIndexer(tuple(np_indexer)), )) + + # basic + for i, k in enumerate(indexer): + if isinstance(k, np.ndarray): 
+ backend_indexer.append(slice(np.min(k), np.max(k) + 1)) + np_indexer.append(k - np.min(k)) + else: + backend_indexer.append(k) + np_indexer.append(slice(None)) + + return (BasicIndexer(tuple(backend_indexer)), + (OuterIndexer(tuple(np_indexer)), )) def _arrayize_vectorized_indexer(indexer, shape): diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 86ac8acbcce..78607329e29 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -377,14 +377,6 @@ def setUp(self): np.array([[0, 3, 3], [4, 6, 7]]), slice(2, -2, 2), slice(2, -2, 3), slice(None)] - def test_decompose_vectorized_indexer(self): - for i, j, k in itertools.product(self.indexers, repeat=3): - vindex = indexing.VectorizedIndexer((i, j, k)) - oind, vind = indexing._decompose_vectorized_indexer(vindex) - np.testing.assert_array_equal( - self.data[vindex], - indexing.NumpyIndexingAdapter(self.data[oind])[vind]) - def test_arrayize_vectorized_indexer(self): for i, j, k in itertools.product(self.indexers, repeat=3): vindex = indexing.VectorizedIndexer((i, j, k)) @@ -394,21 +386,44 @@ def test_arrayize_vectorized_indexer(self): self.data[vindex], self.data[vindex_array],) -def test_unwrap_explicit_indexer(): - indexer = indexing.BasicIndexer((1, 2)) - target = None - - unwrapped = indexing.unwrap_explicit_indexer( - indexer, target, allow=indexing.BasicIndexer) - assert unwrapped == (1, 2) - - with raises_regex(NotImplementedError, 'Load your data'): - indexing.unwrap_explicit_indexer( - indexer, target, allow=indexing.OuterIndexer) - - with raises_regex(TypeError, 'unexpected key type'): - indexing.unwrap_explicit_indexer( - indexer.tuple, target, allow=indexing.OuterIndexer) +def get_indexers(shape, mode): + if mode == 'vectorized': + indexed_shape = (3, 4) + indexer = tuple(np.random.randint(0, s, size=indexed_shape) + for s in shape) + return indexing.VectorizedIndexer(indexer) + + elif mode == 'outer': + indexer = tuple(np.random.randint(0, s, s + 2) for 
s in shape) + return indexing.OuterIndexer(indexer) + + elif mode == 'outer1vec': + indexer = [slice(2, -3) for s in shape] + indexer[1] = np.random.randint(0, shape[1], shape[1] + 2) + return indexing.OuterIndexer(tuple(indexer)) + + else: # basic indexer + indexer = [slice(2, -3) for s in shape] + indexer[0] = 3 + return indexing.BasicIndexer(tuple(indexer)) + + +@pytest.mark.parametrize('shape', [(10, 5, 8), (10, 3)]) +@pytest.mark.parametrize('indexer_mode', ['vectorized', 'outer', 'outer1vec']) +@pytest.mark.parametrize('decompose_mode', + ['vectorized', 'outer', 'outer_1vector', 'basic']) +def test_decompose_indexers(shape, indexer_mode, decompose_mode): + data = np.random.randn(*shape) + indexer = get_indexers(shape, indexer_mode) + + backend_ind, np_ind = indexing.decompose_indexer(indexer, shape, + mode=decompose_mode) + + expected = indexing.NumpyIndexingAdapter(data)[indexer] + array = indexing.NumpyIndexingAdapter(data)[backend_ind] + for ind in np_ind: + array = indexing.NumpyIndexingAdapter(array)[ind] + np.testing.assert_array_equal(expected, array) def test_implicit_indexing_adapter(): From b9f97b4e627ac8d038a4ee4b3e6289af304dfd18 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 14 Feb 2018 14:45:49 +0900 Subject: [PATCH 08/32] typo --- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d92dc85d8c4..8721eb22656 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -642,8 +642,8 @@ def _outer_to_vectorized_indexer(key, shape): ------- tuple Tuple suitable for use to index a NumPy array with vectorized indexing. - Each element is array: broadcasting them together gives the shape of - the result. + Each element is an array: broadcasting them together gives the shape + of the result. 
""" n_dim = len([k for k in key if not isinstance(k, integer_types)]) i_dim = 0 From 850f29cef35da728508474431d23a4a84da17708 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 14 Feb 2018 17:50:01 +0900 Subject: [PATCH 09/32] bugfix and typo --- doc/whats-new.rst | 4 +++- xarray/backends/h5netcdf_.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f2172810364..3fbff0d2a6d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,7 +43,9 @@ Documentation Enhancements ~~~~~~~~~~~~ -- Lazy vectorized-indexing support. (:issue:`1897`) +- Support lazy vectorized-indexing. After this change, flexible indexing such + as orthogonal/vectorized indexing, becomes possible for all the backend + arrays. (:issue:`1897`) By `Keisuke Fujii `_. - reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` arguments. (:issue:`1838`) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index bcde99a8a65..4b328cb198d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -19,13 +19,14 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): key, np_inds = indexing.decompose_indexer(key, self.shape, mode='outer_1vector') + # h5py requires using lists for fancy indexing: # https://github.com/h5py/h5py/issues/992 - key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key) - + key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in + key.tuple) with self.datastore.ensure_open(autoclose=True): - array = np.asarray(self.get_array()[key.tuple], - dtype=self.dtype) + array = np.asarray(self.get_array()[key], dtype=self.dtype) + for ind in np_inds: array = indexing.NumpyIndexingAdapter(array)[ind] From 943ec78bb31a734e753b006a0f3d489c19d36982 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 15 Feb 2018 08:56:35 +0900 Subject: [PATCH 10/32] Fix based on @WeatherGod comments. 
--- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/pynio_.py | 5 +++-- xarray/core/indexing.py | 30 ++++++++++++++++++++++++------ xarray/tests/test_backends.py | 32 ++++---------------------------- 4 files changed, 32 insertions(+), 37 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4b328cb198d..85343f7d53b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -25,7 +25,7 @@ def __getitem__(self, key): key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key.tuple) with self.datastore.ensure_open(autoclose=True): - array = np.asarray(self.get_array()[key], dtype=self.dtype) + array = self.get_array()[key] for ind in np_inds: array = indexing.NumpyIndexingAdapter(array)[ind] diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index c7e0ddf10dd..18360b2dc30 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -28,13 +28,14 @@ def get_array(self): def __getitem__(self, key): key, np_inds = indexing.decompose_indexer(key, self.shape, - mode='outer') + mode='basic') with self.datastore.ensure_open(autoclose=True): array = self.get_array() - if key == () and self.ndim == 0: + if key.tuple == () and self.ndim == 0: return array.get_value() + array = array[key.tuple] for ind in np_inds: array = indexing.NumpyIndexingAdapter(array)[ind] diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 8721eb22656..f280a736560 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -706,10 +706,10 @@ def _decompose_vectorized_indexer(indexer, shape, mode): ---------- indexer: VectorizedIndexer mode: str, one of ['basic' | 'outer' | 'outer_1vector' | 'vectorized'] - basic: for backends that support onle basic indexer + basic: for backends that support only basic indexer outer: for backends that support basic / outer indexer - outer_1vector: for backends that support outer indexer inluding at most - 1 vector. 
+ outer_1vector: for backends that support outer indexer including + at most 1 vector. vectorized: for backends that support full vectorized indexer. Returns @@ -739,12 +739,14 @@ def _decompose_vectorized_indexer(indexer, shape, mode): backend_indexer = [] np_indexer = [] - # posify + # convert negative indices indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k for k, s in zip(indexer.tuple, shape)] for k in indexer: if isinstance(k, slice): + # If it is a slice, then we will slice it as-is (k) in the backend, + # and then use all of it (slice(None)) for the in-memory portion. backend_indexer.append(k) np_indexer.append(slice(None)) else: # np.ndarray @@ -767,9 +769,25 @@ def _decompose_vectorized_indexer(indexer, shape, mode): def _decompose_outer_indexer(indexer, shape, mode): """ - Decompose ouer indexer to the successive two indexers, where the + Decompose outer indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one - is ised to index the loaded on-memory np.ndarray. + is used to index the loaded on-memory np.ndarray. + + Parameters + ---------- + indexer: VectorizedIndexer + mode: str, one of ['basic' | 'outer' | 'outer_1vector' | 'vectorized'] + basic: for backends that support onle basic indexer + outer: for backends that support basic / outer indexer + outer_1vector: for backends that support outer indexer including + at most 1 vector. + vectorized: for backends that support full vectorized indexer. 
+ + Returns + ------- + backend_indexer: OuterIndexer or BasicIndexer + np_indexers: a sequence of OuterIndexer + Note ---- This function is used to realize the vectorized indexing for the backend diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f7233658fb8..1f0a5be5513 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -405,6 +405,8 @@ def test_orthogonal_indexing(self): 'dim3': np.arange(5)} expected = in_memory.isel(**indexers) actual = on_disk.isel(**indexers) + # make sure the array is not yet loaded into memory + assert not actual['var1'].variable._in_memory assert_identical(expected, actual) # do it twice, to make sure we're switched from orthogonal -> numpy # when we cached the values @@ -418,11 +420,11 @@ def test_vectorized_indexing(self): 'dim2': DataArray([0, 2, 3], dims='a')} expected = in_memory.isel(**indexers) actual = on_disk.isel(**indexers) - # make sure the array is not yet loaded + # make sure the array is not yet loaded into memory assert not actual['var1'].variable._in_memory assert_identical(expected, actual.load()) # do it twice, to make sure we're switched from - # orthogonal -> numpy when we cached the values + # vectorized -> numpy when we cached the values actual = on_disk.isel(**indexers) assert_identical(expected, actual) @@ -1554,19 +1556,6 @@ def create_store(self): with create_tmp_file() as tmp_file: yield backends.H5NetCDFStore(tmp_file, 'w') - def test_orthogonal_indexing(self): - # simplified version for h5netcdf - in_memory = create_test_data() - with self.roundtrip(in_memory) as on_disk: - indexers = {'dim3': np.arange(5)} - expected = in_memory.isel(**indexers) - actual = on_disk.isel(**indexers) - assert_identical(expected, actual.load()) - - def test_array_type_after_indexing(self): - # h5netcdf does not support multiple list-like indexers - pass - def test_complex(self): expected = Dataset({'x': ('y', np.ones(5) + 1j * np.ones(5))}) with 
self.roundtrip(expected) as actual: @@ -2117,19 +2106,6 @@ def test_write_store(self): # pynio is read-only for now pass - def test_orthogonal_indexing(self): - # pynio also does not support list-like indexing - with raises_regex(NotImplementedError, 'Outer indexing'): - super(TestPyNio, self).test_orthogonal_indexing() - - def test_isel_dataarray(self): - with raises_regex(NotImplementedError, 'Outer indexing'): - super(TestPyNio, self).test_isel_dataarray() - - def test_array_type_after_indexing(self): - # pynio also does not support list-like indexing - pass - @contextlib.contextmanager def open(self, path, **kwargs): with open_dataset(path, engine='pynio', autoclose=self.autoclose, From 91aae64b27cf847e88ec83e21f91d85dd1085f74 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 15 Feb 2018 17:45:01 +0900 Subject: [PATCH 11/32] Use enum-like object to indicate indexing-support types. --- xarray/backends/h5netcdf_.py | 4 +-- xarray/backends/netCDF4_.py | 4 +-- xarray/backends/pydap_.py | 4 +-- xarray/backends/pynio_.py | 4 +-- xarray/core/indexing.py | 47 ++++++++++++++++++------------------ 5 files changed, 32 insertions(+), 31 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 85343f7d53b..87ae98e8ccc 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -17,8 +17,8 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key, np_inds = indexing.decompose_indexer(key, self.shape, - mode='outer_1vector') + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER_1VECTOR) # h5py requires using lists for fancy indexing: # https://github.com/h5py/h5py/issues/992 diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 0aed2a16ce6..3830a0f079f 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -50,8 +50,8 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def 
__getitem__(self, key): - key, np_inds = indexing.decompose_indexer(key, self.shape, - mode='outer') + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER) if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) else: diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 571160d915a..9eaa1b25ece 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -24,8 +24,8 @@ def dtype(self): return self.array.dtype def __getitem__(self, key): - key, np_inds = indexing.decompose_indexer(key, self.shape, - mode='basic') + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.BASIC) # pull the data from the array attribute if possible, to avoid # downloading coordinate data twice diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 18360b2dc30..c84df3f6200 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -27,8 +27,8 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name] def __getitem__(self, key): - key, np_inds = indexing.decompose_indexer(key, self.shape, - mode='basic') + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.BASIC) with self.datastore.ensure_open(autoclose=True): array = self.get_array() diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f280a736560..e459c29a495 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -686,17 +686,28 @@ def _outer_to_numpy_indexer(key, shape): return _outer_to_vectorized_indexer(key, shape) -def decompose_indexer(indexer, shape, mode): +class IndexingSupport(object): # could inherit from enum.Enum on Python 3 + # for backends that support only basic indexer + BASIC = 'BASIC' + # for backends that support basic / outer indexer + OUTER = 'OUTER' + # for backends that support outer indexer including at most 1 vector. 
+ OUTER_1VECTOR = 'OUTER_1VECTOR' + # for backends that support full vectorized indexer. + VECTORIZED = 'VECTORIZED' + + +def decompose_indexer(indexer, shape, indexing_support): if isinstance(indexer, VectorizedIndexer): - return _decompose_vectorized_indexer(indexer, shape, mode) + return _decompose_vectorized_indexer(indexer, shape, indexing_support) if isinstance(indexer, OuterIndexer): - return _decompose_outer_indexer(indexer, shape, mode) + return _decompose_outer_indexer(indexer, shape, indexing_support) if isinstance(indexer, BasicIndexer): return indexer, () raise TypeError('unexpected key type: {}'.format(indexer)) -def _decompose_vectorized_indexer(indexer, shape, mode): +def _decompose_vectorized_indexer(indexer, shape, indexing_support): """ Decompose vectorized indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -705,12 +716,7 @@ def _decompose_vectorized_indexer(indexer, shape, mode): Parameters ---------- indexer: VectorizedIndexer - mode: str, one of ['basic' | 'outer' | 'outer_1vector' | 'vectorized'] - basic: for backends that support only basic indexer - outer: for backends that support basic / outer indexer - outer_1vector: for backends that support outer indexer including - at most 1 vector. - vectorized: for backends that support full vectorized indexer. 
+ indexing_support: one of IndexerSupport entries Returns ------- @@ -734,7 +740,7 @@ def _decompose_vectorized_indexer(indexer, shape, mode): """ assert isinstance(indexer, VectorizedIndexer) - if mode == 'vectorized': + if indexing_support is IndexingSupport.VECTORIZED: return indexer, () backend_indexer = [] @@ -757,17 +763,17 @@ def _decompose_vectorized_indexer(indexer, shape, mode): backend_indexer = OuterIndexer(tuple(backend_indexer)) np_indexer = VectorizedIndexer(tuple(np_indexer)) - if mode == 'outer': + if indexing_support is IndexingSupport.OUTER: return backend_indexer, (np_indexer, ) shape = tuple(s if isinstance(k, slice) else len(k) for k, s in zip(backend_indexer.tuple, shape)) backend_indexer, np_indexers = _decompose_outer_indexer( - backend_indexer, shape, mode) + backend_indexer, shape, indexing_support) return backend_indexer, (np_indexers[0], np_indexer) -def _decompose_outer_indexer(indexer, shape, mode): +def _decompose_outer_indexer(indexer, shape, indexing_support): """ Decompose outer indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -776,12 +782,7 @@ def _decompose_outer_indexer(indexer, shape, mode): Parameters ---------- indexer: VectorizedIndexer - mode: str, one of ['basic' | 'outer' | 'outer_1vector' | 'vectorized'] - basic: for backends that support onle basic indexer - outer: for backends that support basic / outer indexer - outer_1vector: for backends that support outer indexer including - at most 1 vector. - vectorized: for backends that support full vectorized indexer. + indexing_support: One of the entries of IndexingSupport Returns ------- @@ -803,7 +804,7 @@ def _decompose_outer_indexer(indexer, shape, mode): >>> np_indexer = OuterIndexer([0, 2, 1], [0, 1, 0]) >>> array[np_indexer] # outer indexing for on-memory np.ndarray. 
""" - if mode in ['outer', 'vectorized']: + if indexing_support in [IndexingSupport.OUTER, IndexingSupport.VECTORIZED]: return indexer, () assert isinstance(indexer, OuterIndexer) @@ -813,7 +814,7 @@ def _decompose_outer_indexer(indexer, shape, mode): indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k for k, s in zip(indexer.tuple, shape)] - if mode == 'outer_1vector': + if indexing_support is IndexingSupport.OUTER_1VECTOR: # some backends such as h5py supports only 1 vector in indexers # We choose the most efficient axis gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) @@ -835,7 +836,7 @@ def _decompose_outer_indexer(indexer, shape, mode): return (OuterIndexer(tuple(backend_indexer)), (OuterIndexer(tuple(np_indexer)), )) - if mode == 'outer_sorted': + if indexing_support == IndexingSupport.OUTER: for k in indexer: if (isinstance(k, integer_types + (slice, )) or (np.diff(k) >= 0).all()): From 991c1dac20b270da8674d01ee807a0ad9955ba12 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 15 Feb 2018 21:14:05 +0900 Subject: [PATCH 12/32] Update test_decompose_indexers. 
--- xarray/tests/test_indexing.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 78607329e29..6a70455f4ed 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -410,14 +410,17 @@ def get_indexers(shape, mode): @pytest.mark.parametrize('shape', [(10, 5, 8), (10, 3)]) @pytest.mark.parametrize('indexer_mode', ['vectorized', 'outer', 'outer1vec']) -@pytest.mark.parametrize('decompose_mode', - ['vectorized', 'outer', 'outer_1vector', 'basic']) -def test_decompose_indexers(shape, indexer_mode, decompose_mode): +@pytest.mark.parametrize('indexing_support', + [indexing.IndexingSupport.BASIC, + indexing.IndexingSupport.OUTER, + indexing.IndexingSupport.OUTER_1VECTOR, + indexing.IndexingSupport.VECTORIZED]) +def test_decompose_indexers(shape, indexer_mode, indexing_support): data = np.random.randn(*shape) indexer = get_indexers(shape, indexer_mode) - backend_ind, np_ind = indexing.decompose_indexer(indexer, shape, - mode=decompose_mode) + backend_ind, np_ind = indexing.decompose_indexer( + indexer, shape, indexing_support) expected = indexing.NumpyIndexingAdapter(data)[indexer] array = indexing.NumpyIndexingAdapter(data)[backend_ind] From 936954a318c5af48e4391e9ac38d5880eb663a3e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 15 Feb 2018 22:55:27 +0900 Subject: [PATCH 13/32] Bugfix and benchmarks. --- asv_bench/benchmarks/dataset_io.py | 52 +++++++++++++++++++++++++++++- xarray/core/indexing.py | 8 ++++- xarray/tests/test_backends.py | 28 +++++++++++++++- xarray/tests/test_indexing.py | 5 +++ 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index d7766d99a3d..1eec598bf5b 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -13,7 +13,7 @@ import xarray as xr -from . import randn, requires_dask +from . 
import randn, randint, requires_dask class IOSingleNetCDF(object): @@ -73,6 +73,15 @@ def make_ds(self): self.ds.attrs = {'history': 'created for xarray benchmarking'} + self.oinds = {'time': randint(0, self.nt, 120), + 'lon': randint(0, self.nx, 20), + 'lat': randint(0, self.ny, 10)} + self.vinds = {'time': xr.DataArray(randint(0, self.nt, 120), + dims='x'), + 'lon': xr.DataArray(randint(0, self.nx, 120), + dims='x'), + 'lat': slice(3, 20)} + class IOWriteSingleNetCDF3(IOSingleNetCDF): def setup(self): @@ -100,6 +109,14 @@ def setup(self): def time_load_dataset_netcdf4(self): xr.open_dataset(self.filepath, engine='netcdf4').load() + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4') + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4') + ds = ds.isel(**self.vinds).load() + class IOReadSingleNetCDF3(IOReadSingleNetCDF4): def setup(self): @@ -113,6 +130,14 @@ def setup(self): def time_load_dataset_scipy(self): xr.open_dataset(self.filepath, engine='scipy').load() + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy') + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy') + ds = ds.isel(**self.vinds).load() + class IOReadSingleNetCDF4Dask(IOSingleNetCDF): def setup(self): @@ -129,6 +154,16 @@ def time_load_dataset_netcdf4_with_block_chunks(self): xr.open_dataset(self.filepath, engine='netcdf4', chunks=self.block_chunks).load() + def time_load_dataset_netcdf4_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4', + chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4', + chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + def 
time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): with dask.set_options(get=dask.multiprocessing.get): xr.open_dataset(self.filepath, engine='netcdf4', @@ -160,6 +195,21 @@ def time_load_dataset_scipy_with_block_chunks(self): xr.open_dataset(self.filepath, engine='scipy', chunks=self.block_chunks).load() + def time_load_dataset_scipy_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy', + chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_scipy_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy', + chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + + def time_load_dataset_scipy_with_block_chunks(self): + with dask.set_options(get=dask.multiprocessing.get): + xr.open_dataset(self.filepath, engine='scipy', + chunks=self.block_chunks).load() + def time_load_dataset_scipy_with_time_chunks(self): with dask.set_options(get=dask.multiprocessing.get): xr.open_dataset(self.filepath, engine='scipy', diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e459c29a495..2b492fed339 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -532,7 +532,6 @@ def __init__(self, array, key): else: self.key = _arrayize_vectorized_indexer(key, array.shape) self.array = as_indexable(array) - self.order = np.arange(self.ndim) @property def shape(self): @@ -542,6 +541,7 @@ def __array__(self, dtype=None): return np.asarray(self.array[self.key], dtype=None) def _updated_key(self, new_key): + new_key = _arrayize_vectorized_indexer(new_key, self.shape) return VectorizedIndexer(tuple(o[new_key.tuple] for o in np.broadcast_arrays(*self.key.tuple))) @@ -587,6 +587,9 @@ def __array__(self, dtype=None): def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) + def transpose(self, order): + return self.array.transpose(order) + def __setitem__(self, key, value): self._ensure_copied() self.array[key] = value @@ 
-607,6 +610,9 @@ def __array__(self, dtype=None): def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) + def transpose(self, order): + return self.array.transpose(order) + def __setitem__(self, key, value): self.array[key] = value diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1f0a5be5513..4ef2f91db1e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -428,6 +428,33 @@ def test_vectorized_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) + # make sure lazy indexing certainly works. + with self.roundtrip(in_memory) as on_disk: + indexers = { + 'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': DataArray([[0, 4], [1, 3], [2, 2]], dims=['a', 'b']), + 'dim2': slice(None, 10)} + expected = in_memory.isel(**indexers)['var3'] + actual = on_disk.isel(**indexers)['var3'] + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + indexers = {'a': DataArray([0, 1], dims=['c']), + 'b': DataArray([0, 1], dims=['c'])} + expected = expected.isel(**indexers) + actual = actual.isel(**indexers) + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + + with self.roundtrip(in_memory) as on_disk: + indexers = { + 'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim2': slice(None, 10)} + expected = in_memory.isel(**indexers)['var3'] + actual = on_disk.isel(**indexers)['var3'] + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + def test_isel_dataarray(self): # Make sure isel works lazily. 
GH:issue:1688 in_memory = create_test_data() @@ -504,7 +531,6 @@ def test_roundtrip_bytes_with_fill_value(self): encoding = {'_FillValue': b'X', 'dtype': 'S1'} original = Dataset({'x': ('t', values, {}, encoding)}) expected = original.copy(deep=True) - print(original) with self.roundtrip(original) as actual: assert_identical(expected, actual) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 6a70455f4ed..fb2cfbcdca5 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -232,6 +232,11 @@ def check_indexing(v_eager, v_lazy, indexers): (Variable(['i', 'j'], [[0, 1], [1, 2]]), )] check_indexing(v_eager, v_lazy, indexers) + indexers = [ + (Variable('i', [3, 2, 4, 3]), Variable('i', [3, 2, 1, 0])), + (Variable(['i', 'j'], [[0, 1], [1, 2]]), )] + check_indexing(v_eager, v_lazy, indexers) + class TestCopyOnWriteArray(TestCase): def test_setitem(self): From 9144965cce88aff47001eeb1c91615be87798d65 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 22:36:32 +0900 Subject: [PATCH 14/32] fix: support outer/basic indexer in LazilyVectorizedIndexedArray --- xarray/core/indexing.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 2b492fed339..db8101bb0f1 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -527,8 +527,7 @@ def __init__(self, array, key): key : VectorizedIndexer """ if isinstance(key, (BasicIndexer, OuterIndexer)): - self.key = VectorizedIndexer( - _outer_to_vectorized_indexer(key.tuple, array.shape)) + self.key = _outer_to_vectorized_indexer(key, array.shape) else: self.key = _arrayize_vectorized_indexer(key, array.shape) self.array = as_indexable(array) @@ -541,7 +540,10 @@ def __array__(self, dtype=None): return np.asarray(self.array[self.key], dtype=None) def _updated_key(self, new_key): - new_key = _arrayize_vectorized_indexer(new_key, self.shape) + if isinstance(new_key, 
VectorizedIndexer): + new_key = _arrayize_vectorized_indexer(new_key, self.shape) + else: + new_key = _outer_to_vectorized_indexer(new_key, self.shape) return VectorizedIndexer(tuple(o[new_key.tuple] for o in np.broadcast_arrays(*self.key.tuple))) @@ -639,18 +641,21 @@ def _outer_to_vectorized_indexer(key, shape): Parameters ---------- - key : tuple + key : Outer/Basic Indexer Tuple from an Basic/OuterIndexer to convert. shape : tuple Shape of the array subject to the indexing. Returns ------- - tuple + VectorizedInexer Tuple suitable for use to index a NumPy array with vectorized indexing. Each element is an array: broadcasting them together gives the shape of the result. """ + if isinstance(key, ExplicitIndexer): + key = key.tuple + n_dim = len([k for k in key if not isinstance(k, integer_types)]) i_dim = 0 new_key = [] @@ -665,7 +670,7 @@ def _outer_to_vectorized_indexer(key, shape): (1,) * (n_dim - i_dim - 1)] new_key.append(k.reshape(*shape)) i_dim += 1 - return tuple(new_key) + return VectorizedIndexer(tuple(new_key)) def _outer_to_numpy_indexer(key, shape): @@ -689,7 +694,7 @@ def _outer_to_numpy_indexer(key, shape): # Boolean index should already be converted to integer array. return tuple(key) else: - return _outer_to_vectorized_indexer(key, shape) + return _outer_to_vectorized_indexer(key, shape).tuple class IndexingSupport(object): # could inherit from enum.Enum on Python 3 @@ -936,7 +941,7 @@ def create_mask(indexer, shape, chunks_hint=None): same shape as the indexing result. """ if isinstance(indexer, OuterIndexer): - key = _outer_to_vectorized_indexer(indexer.tuple, shape) + key = _outer_to_vectorized_indexer(indexer.tuple, shape).tuple assert not any(isinstance(k, slice) for k in key) mask = _masked_result_drop_slice(key, chunks_hint) From 95e1f1cc701bf885a953dadc1d042bba6e795dff Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 22:54:22 +0900 Subject: [PATCH 15/32] More comments. 
--- xarray/core/indexing.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index db8101bb0f1..c2b43d73c00 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -766,7 +766,9 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): # and then use all of it (slice(None)) for the in-memory portion. backend_indexer.append(k) np_indexer.append(slice(None)) - else: # np.ndarray + else: + # If it is a (multidimensional) np.ndarray, just pickup the used + # keys without duplication and store them as a 1d-np.ndarray. oind, vind = np.unique(k, return_inverse=True) backend_indexer.append(oind) np_indexer.append(vind.reshape(*k.shape)) @@ -777,6 +779,8 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): if indexing_support is IndexingSupport.OUTER: return backend_indexer, (np_indexer, ) + # If the backend does not support outer indexing, + # backend_indexer (OuterIndexer) is also decomposed. shape = tuple(s if isinstance(k, slice) else len(k) for k, s in zip(backend_indexer.tuple, shape)) backend_indexer, np_indexers = _decompose_outer_indexer( @@ -834,13 +838,19 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): for i, k in enumerate(indexer): if isinstance(k, np.ndarray) and i != array_index: + # np.ndarray key is converted to slice that covers the entire + # entries of this key. backend_indexer.append(slice(np.min(k), np.max(k) + 1)) np_indexer.append(k - np.min(k)) elif isinstance(k, np.ndarray): + # Remove duplicates and sort them in the increasing order pkey, ekey = np.unique(k, return_inverse=True) backend_indexer.append(pkey) np_indexer.append(ekey) else: + # If it is a slice or an integer, then we will slice it + # as-is (k) in the backend, and then use all of it + # slice(None)) for the in-memory portion. 
backend_indexer.append(k) np_indexer.append(slice(None)) @@ -853,7 +863,8 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): (np.diff(k) >= 0).all()): backend_indexer.append(k) np_indexer.append(slice(None)) - else: # not sorted np.ndarray indexer + else: + # Remove duplicates and sort them in the increasing order oind, vind = np.unique(k, return_inverse=True) backend_indexer.append(oind) np_indexer.append(vind.reshape(*k.shape)) @@ -864,9 +875,11 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): # basic for i, k in enumerate(indexer): if isinstance(k, np.ndarray): + # np.ndarray key is converted to slice that covers the entire + # entries of this key. backend_indexer.append(slice(np.min(k), np.max(k) + 1)) np_indexer.append(k - np.min(k)) - else: + else: # integer or slice backend_indexer.append(k) np_indexer.append(slice(None)) From 180c4f5a1f51d6d22e707f91c7e23ec76798dd58 Mon Sep 17 00:00:00 2001 From: stickler-ci Date: Fri, 16 Feb 2018 13:54:49 +0000 Subject: [PATCH 16/32] Fixing style errors. 
--- asv_bench/benchmarks/dataset_io.py | 2 +- xarray/core/indexing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index 1eec598bf5b..1065f0427df 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -77,7 +77,7 @@ def make_ds(self): 'lon': randint(0, self.nx, 20), 'lat': randint(0, self.ny, 10)} self.vinds = {'time': xr.DataArray(randint(0, self.nt, 120), - dims='x'), + dims='x'), 'lon': xr.DataArray(randint(0, self.nx, 120), dims='x'), 'lat': slice(3, 20)} diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c2b43d73c00..6a03ba57e13 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -545,7 +545,7 @@ def _updated_key(self, new_key): else: new_key = _outer_to_vectorized_indexer(new_key, self.shape) return VectorizedIndexer(tuple(o[new_key.tuple] for o in - np.broadcast_arrays(*self.key.tuple))) + np.broadcast_arrays(*self.key.tuple))) def __getitem__(self, indexer): return type(self)(self.array, self._updated_key(indexer)) From dbbe531098cb751ec010e4898712ff7ba7386511 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 22:59:48 +0900 Subject: [PATCH 17/32] Remove unintended duplicate --- asv_bench/benchmarks/dataset_io.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index 1065f0427df..f839d6ccffe 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -205,11 +205,6 @@ def time_load_dataset_scipy_with_block_chunks_vindexing(self): chunks=self.block_chunks) ds = ds.isel(**self.vinds).load() - def time_load_dataset_scipy_with_block_chunks(self): - with dask.set_options(get=dask.multiprocessing.get): - xr.open_dataset(self.filepath, engine='scipy', - chunks=self.block_chunks).load() - def time_load_dataset_scipy_with_time_chunks(self): with
dask.set_options(get=dask.multiprocessing.get): xr.open_dataset(self.filepath, engine='scipy', From c2e61ad17d13a126d4fcf020528dc19e0dd2ee51 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 17 Feb 2018 00:57:54 +0900 Subject: [PATCH 18/32] combine indexers for on-memory np.ndarray. --- xarray/backends/h5netcdf_.py | 4 +-- xarray/backends/netCDF4_.py | 4 +-- xarray/backends/pydap_.py | 2 +- xarray/backends/pynio_.py | 4 +-- xarray/core/indexing.py | 55 ++++++++++++++++++++--------------- xarray/tests/test_indexing.py | 7 +++-- 6 files changed, 42 insertions(+), 34 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 87ae98e8ccc..9b3b69369d7 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -27,8 +27,8 @@ def __getitem__(self, key): with self.datastore.ensure_open(autoclose=True): array = self.get_array()[key] - for ind in np_inds: - array = indexing.NumpyIndexingAdapter(array)[ind] + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] return array diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 3fa425f13e1..43900912e44 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -72,8 +72,8 @@ def __getitem__(self, key): msg += '\n\nOriginal traceback:\n' + traceback.format_exc() raise IndexError(msg) - for ind in np_inds: - array = indexing.NumpyIndexingAdapter(array)[ind] + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] return array diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 9eaa1b25ece..c9980529103 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -37,7 +37,7 @@ def __getitem__(self, key): if len(axis) > 0: result = np.squeeze(result, axis) - for ind in np_inds: + if len(np_inds.tuple) > 0: result = indexing.NumpyIndexingAdapter(result)[ind] return result diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py 
index c84df3f6200..1129e77b208 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -36,8 +36,8 @@ def __getitem__(self, key): return array.get_value() array = array[key.tuple] - for ind in np_inds: - array = indexing.NumpyIndexingAdapter(array)[ind] + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] return array diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6a03ba57e13..91abc476bf5 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -540,12 +540,7 @@ def __array__(self, dtype=None): return np.asarray(self.array[self.key], dtype=None) def _updated_key(self, new_key): - if isinstance(new_key, VectorizedIndexer): - new_key = _arrayize_vectorized_indexer(new_key, self.shape) - else: - new_key = _outer_to_vectorized_indexer(new_key, self.shape) - return VectorizedIndexer(tuple(o[new_key.tuple] for o in - np.broadcast_arrays(*self.key.tuple))) + return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): return type(self)(self.array, self._updated_key(indexer)) @@ -642,7 +637,7 @@ def _outer_to_vectorized_indexer(key, shape): Parameters ---------- key : Outer/Basic Indexer - Tuple from an Basic/OuterIndexer to convert. + An indexer to convert. shape : tuple Shape of the array subject to the indexing. @@ -653,8 +648,7 @@ def _outer_to_vectorized_indexer(key, shape): Each element is an array: broadcasting them together gives the shape of the result. """ - if isinstance(key, ExplicitIndexer): - key = key.tuple + key = key.tuple n_dim = len([k for k in key if not isinstance(k, integer_types)]) i_dim = 0 @@ -678,8 +672,8 @@ def _outer_to_numpy_indexer(key, shape): Parameters ---------- - key : tuple - Tuple from an OuterIndexer to convert. + key : Basic/OuterIndexer + An indexer to convert. shape : tuple Shape of the array subject to the indexing. 
@@ -688,15 +682,26 @@ def _outer_to_numpy_indexer(key, shape): tuple Tuple suitable for use to index a NumPy array. """ - if len([k for k in key if not isinstance(k, slice)]) <= 1: + if len([k for k in key.tuple if not isinstance(k, slice)]) <= 1: # If there is only one vector and all others are slice, # it can be safely used in mixed basic/advanced indexing. # Boolean index should already be converted to integer array. - return tuple(key) + return key.tuple else: return _outer_to_vectorized_indexer(key, shape).tuple +def _combine_indexers(old_key, old_shape, new_key): + """ Combine two indexers. old_key should be a VectorizedIndexer consisting + of all np.ndarray, while new_key can be all the indxer types.""" + if isinstance(new_key, VectorizedIndexer): + new_key = _arrayize_vectorized_indexer(new_key, old_shape) + else: + new_key = _outer_to_vectorized_indexer(new_key, old_shape) + return VectorizedIndexer(tuple(o[new_key.tuple] for o in + np.broadcast_arrays(*old_key.tuple))) + + class IndexingSupport(object): # could inherit from enum.Enum on Python 3 # for backends that support only basic indexer BASIC = 'BASIC' @@ -714,7 +719,7 @@ def decompose_indexer(indexer, shape, indexing_support): if isinstance(indexer, OuterIndexer): return _decompose_outer_indexer(indexer, shape, indexing_support) if isinstance(indexer, BasicIndexer): - return indexer, () + return indexer, BasicIndexer(()) raise TypeError('unexpected key type: {}'.format(indexer)) @@ -752,7 +757,7 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): assert isinstance(indexer, VectorizedIndexer) if indexing_support is IndexingSupport.VECTORIZED: - return indexer, () + return indexer, BasicIndexer(()) backend_indexer = [] np_indexer = [] @@ -777,15 +782,17 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): np_indexer = VectorizedIndexer(tuple(np_indexer)) if indexing_support is IndexingSupport.OUTER: - return backend_indexer, (np_indexer, ) + return 
backend_indexer, np_indexer # If the backend does not support outer indexing, # backend_indexer (OuterIndexer) is also decomposed. shape = tuple(s if isinstance(k, slice) else len(k) for k, s in zip(backend_indexer.tuple, shape)) - backend_indexer, np_indexers = _decompose_outer_indexer( + backend_indexer, np_indexer1 = _decompose_outer_indexer( backend_indexer, shape, indexing_support) - return backend_indexer, (np_indexers[0], np_indexer) + np_indexer = _combine_indexers( + _outer_to_vectorized_indexer(np_indexer1, shape), shape, np_indexer) + return backend_indexer, np_indexer def _decompose_outer_indexer(indexer, shape, indexing_support): @@ -820,7 +827,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): >>> array[np_indexer] # outer indexing for on-memory np.ndarray. """ if indexing_support in [IndexingSupport.OUTER, IndexingSupport.VECTORIZED]: - return indexer, () + return indexer, BasicIndexer(()) assert isinstance(indexer, OuterIndexer) backend_indexer = [] @@ -855,7 +862,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): np_indexer.append(slice(None)) return (OuterIndexer(tuple(backend_indexer)), - (OuterIndexer(tuple(np_indexer)), )) + OuterIndexer(tuple(np_indexer))) if indexing_support == IndexingSupport.OUTER: for k in indexer: @@ -870,7 +877,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): np_indexer.append(vind.reshape(*k.shape)) return (OuterIndexer(tuple(backend_indexer)), - (OuterIndexer(tuple(np_indexer)), )) + OuterIndexer(tuple(np_indexer))) # basic for i, k in enumerate(indexer): @@ -884,7 +891,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): np_indexer.append(slice(None)) return (BasicIndexer(tuple(backend_indexer)), - (OuterIndexer(tuple(np_indexer)), )) + OuterIndexer(tuple(np_indexer))) def _arrayize_vectorized_indexer(indexer, shape): @@ -954,7 +961,7 @@ def create_mask(indexer, shape, chunks_hint=None): same shape as the indexing result. 
""" if isinstance(indexer, OuterIndexer): - key = _outer_to_vectorized_indexer(indexer.tuple, shape).tuple + key = _outer_to_vectorized_indexer(indexer, shape).tuple assert not any(isinstance(k, slice) for k in key) mask = _masked_result_drop_slice(key, chunks_hint) @@ -1049,7 +1056,7 @@ def _ensure_ndarray(self, value): def _indexing_array_and_key(self, key): if isinstance(key, OuterIndexer): array = self.array - key = _outer_to_numpy_indexer(key.tuple, self.array.shape) + key = _outer_to_numpy_indexer(key, self.array.shape) elif isinstance(key, VectorizedIndexer): array = nputils.NumpyVIndexAdapter(self.array) key = key.tuple diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index fb2cfbcdca5..61f3abf48d3 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -429,8 +429,8 @@ def test_decompose_indexers(shape, indexer_mode, indexing_support): expected = indexing.NumpyIndexingAdapter(data)[indexer] array = indexing.NumpyIndexingAdapter(data)[backend_ind] - for ind in np_ind: - array = indexing.NumpyIndexingAdapter(array)[ind] + if len(np_ind.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_ind] np.testing.assert_array_equal(expected, array) @@ -468,7 +468,8 @@ def nonzero(x): expected_data = np.moveaxis(expected_data, old_order, new_order) - outer_index = (nonzero(i), nonzero(j), nonzero(k)) + outer_index = indexing.OuterIndexer((nonzero(i), nonzero(j), + nonzero(k))) actual = indexing._outer_to_numpy_indexer(outer_index, v.shape) actual_data = v.data[actual] np.testing.assert_array_equal(actual_data, expected_data) From b545c3ecf00b9e2ece6a74f7908d4debd2ea2d46 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 17 Feb 2018 00:58:04 +0900 Subject: [PATCH 19/32] fix whats new --- doc/whats-new.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8ac965edf20..819ec2c25b0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -47,7 +47,6 @@ Enhancements 
as orthogonal/vectorized indexing, becomes possible for all the backend arrays. (:issue:`1897`) By `Keisuke Fujii `_. -- reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` - Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. .. ipython:: python From 872de737b7ad922a4f8aad082412dcd3766ba526 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 17 Feb 2018 01:00:27 +0900 Subject: [PATCH 20/32] fix pydap --- xarray/backends/pydap_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index c9980529103..de79203c207 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -38,7 +38,7 @@ def __getitem__(self, key): result = np.squeeze(result, axis) if len(np_inds.tuple) > 0: - result = indexing.NumpyIndexingAdapter(result)[ind] + result = indexing.NumpyIndexingAdapter(result)[np_inds] return result From bb5d1f6a894299c5e539c00d80f71e34e3625002 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 17 Feb 2018 01:14:46 +0900 Subject: [PATCH 21/32] Update comments. 
--- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 91abc476bf5..bd15da1b7a8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -737,7 +737,7 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): Returns ------- backend_indexer: OuterIndexer or BasicIndexer - np_indexers: a sequence of vectorizedIndexer + np_indexers: an ExplicitIndexer (VectorizedIndexer / BasicIndexer) Note ---- @@ -809,7 +809,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): Returns ------- backend_indexer: OuterIndexer or BasicIndexer - np_indexers: a sequence of OuterIndexer + np_indexers: an ExplicitIndexer (OuterIndexer / BasicIndexer) Note ---- From cfe29bbe2b8671e7fb47b493ef094050b8945b34 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 00:33:03 +0900 Subject: [PATCH 22/32] Support VectorizedIndexing for rasterio. Some bugfix. --- xarray/backends/rasterio_.py | 66 +++++++++++++++++++++++------------ xarray/core/indexing.py | 60 ++++++++++++++++++++++--------- xarray/tests/test_backends.py | 63 ++++++++++++++++++++------------- xarray/tests/test_indexing.py | 27 ++++++++++++-- 4 files changed, 151 insertions(+), 65 deletions(-) diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 4534edfb388..e4121a5c768 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -40,50 +40,72 @@ def dtype(self): def shape(self): return self._shape - def __getitem__(self, key): - # TODO support indexing.decompose_indexer - if isinstance(key, indexing.VectorizedIndexer): - raise NotImplementedError - key = key.tuple + def _get_indexer(self, key): + """ Get indexer for rasterio array. + + Parameter + --------- + key: ExplicitIndexer + + Returns + ------- + band_key: an indexer for the 1st dimension + window: two tuples. Each consists of (start, stop). 
+ squeeze_axis: axes to be squeezed + np_ind: indexer for loaded numpy array + + See also + -------- + indexing.decompose_indexer + """ + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER) # bands cannot be windowed but they can be listed - band_key = key[0] - n_bands = self.shape[0] + band_key = key.tuple[0] + new_shape = [] + np_inds2 = [] + # bands (axis=0) cannot be windowed but they can be listed if isinstance(band_key, slice): - start, stop, step = band_key.indices(n_bands) - if step is not None and step != 1: - raise IndexError(_ERROR_MSG) - band_key = np.arange(start, stop) + start, stop, step = band_key.indices(self.shape[0]) + band_key = np.arange(start, stop, step) # be sure we give out a list band_key = (np.asarray(band_key) + 1).tolist() + if hasattr(band_key, '__len__'): # if band_key is not a scalar + new_shape.append(len(band_key)) + np_inds2.append(slice(None)) # but other dims can only be windowed window = [] squeeze_axis = [] - for i, (k, n) in enumerate(zip(key[1:], self.shape[1:])): + for i, (k, n) in enumerate(zip(key.tuple[1:], self.shape[1:])): if isinstance(k, slice): start, stop, step = k.indices(n) - if step is not None and step != 1: - raise IndexError(_ERROR_MSG) + np_inds2.append(slice(None, None, step)) + new_shape.append(stop - start) elif is_scalar(k): # windowed operations will always return an array # we will have to squeeze it later - squeeze_axis.append(i + 1) + squeeze_axis.append(- (2 - i)) start = k stop = k + 1 else: - k = np.asarray(k) - start = k[0] - stop = k[-1] + 1 - ids = np.arange(start, stop) - if not ((k.shape == ids.shape) and np.all(k == ids)): - raise IndexError(_ERROR_MSG) + start, stop = np.min(k), np.max(k) + 1 + np_inds2.append(k - start) + new_shape.append(stop - start) window.append((start, stop)) + np_inds = indexing._combine_indexers( + indexing.OuterIndexer(tuple(np_inds2)), new_shape, np_inds) + return band_key, window, tuple(squeeze_axis), np_inds + + def 
__getitem__(self, key): + band_key, window, squeeze_axis, np_inds = self._get_indexer(key) + out = self.rasterio_ds.read(band_key, window=tuple(window)) if squeeze_axis: out = np.squeeze(out, axis=squeeze_axis) - return out + return indexing.NumpyIndexingAdapter(out)[np_inds] def _parse_envi(meta): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bd15da1b7a8..a0a8ef26159 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -691,13 +691,29 @@ def _outer_to_numpy_indexer(key, shape): return _outer_to_vectorized_indexer(key, shape).tuple -def _combine_indexers(old_key, old_shape, new_key): - """ Combine two indexers. old_key should be a VectorizedIndexer consisting - of all np.ndarray, while new_key can be all the indxer types.""" +def _combine_indexers(old_key, shape, new_key): + """ Combine two indexers. + + Parameters + ---------- + old_key: ExplicitIndexer + The first indexer for the original array + shape: tuple of ints + Shape of the original array to be indexed by old_key + new_key: + The second indexer for indexing original[old_key] + """ + if not isinstance(old_key, VectorizedIndexer): + old_key = _outer_to_vectorized_indexer(old_key, shape) + if len(old_key.tuple) == 0: + return new_key + + new_shape = np.broadcast(*old_key.tuple).shape if isinstance(new_key, VectorizedIndexer): - new_key = _arrayize_vectorized_indexer(new_key, old_shape) + new_key = _arrayize_vectorized_indexer(new_key, new_shape) else: - new_key = _outer_to_vectorized_indexer(new_key, old_shape) + new_key = _outer_to_vectorized_indexer(new_key, new_shape) + return VectorizedIndexer(tuple(o[new_key.tuple] for o in np.broadcast_arrays(*old_key.tuple))) @@ -786,12 +802,9 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): # If the backend does not support outer indexing, # backend_indexer (OuterIndexer) is also decomposed. 
- shape = tuple(s if isinstance(k, slice) else len(k) for k, s in - zip(backend_indexer.tuple, shape)) backend_indexer, np_indexer1 = _decompose_outer_indexer( backend_indexer, shape, indexing_support) - np_indexer = _combine_indexers( - _outer_to_vectorized_indexer(np_indexer1, shape), shape, np_indexer) + np_indexer = _combine_indexers(np_indexer1, shape, np_indexer) return backend_indexer, np_indexer @@ -826,15 +839,22 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): >>> np_indexer = OuterIndexer([0, 2, 1], [0, 1, 0]) >>> array[np_indexer] # outer indexing for on-memory np.ndarray. """ - if indexing_support in [IndexingSupport.OUTER, IndexingSupport.VECTORIZED]: + if indexing_support == IndexingSupport.VECTORIZED: return indexer, BasicIndexer(()) assert isinstance(indexer, OuterIndexer) backend_indexer = [] np_indexer = [] - # posify - indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k - for k, s in zip(indexer.tuple, shape)] + # make indexer positive + pos_indexer = [] + for k, s in zip(indexer.tuple, shape): + if isinstance(k, np.ndarray): + pos_indexer.append(np.where(k < 0, k + s, k)) + elif isinstance(k, integer_types) and k < 0: + pos_indexer.append(k + s) + else: + pos_indexer.append(k) + indexer = pos_indexer if indexing_support is IndexingSupport.OUTER_1VECTOR: # some backends such as h5py supports only 1 vector in indexers @@ -854,6 +874,8 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): pkey, ekey = np.unique(k, return_inverse=True) backend_indexer.append(pkey) np_indexer.append(ekey) + elif isinstance(k, integer_types): + backend_indexer.append(k) else: # If it is a slice or an integer, then we will slice it # as-is (k) in the backend, and then use all of it @@ -866,10 +888,12 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): if indexing_support == IndexingSupport.OUTER: for k in indexer: - if (isinstance(k, integer_types + (slice, )) or - (np.diff(k) >= 0).all()): + if 
(isinstance(k, slice) or + (isinstance(k, np.ndarray) and (np.diff(k) >= 0).all())): backend_indexer.append(k) np_indexer.append(slice(None)) + elif isinstance(k, integer_types): + backend_indexer.append(k) else: # Remove duplicates and sort them in the increasing order oind, vind = np.unique(k, return_inverse=True) @@ -880,13 +904,15 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): OuterIndexer(tuple(np_indexer))) # basic - for i, k in enumerate(indexer): + for k in indexer: if isinstance(k, np.ndarray): # np.ndarray key is converted to slice that covers the entire # entries of this key. backend_indexer.append(slice(np.min(k), np.max(k) + 1)) np_indexer.append(k - np.min(k)) - else: # integer or slice + elif isinstance(k, integer_types): + backend_indexer.append(k) + else: # slice backend_indexer.append(k) np_indexer.append(slice(None)) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2ec7e407e89..cb23c800a6f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2327,17 +2327,46 @@ def test_indexing(self): # tests # assert_allclose checks all data + coordinates assert_allclose(actual, expected) + assert not actual.variable._in_memory + + # Basic indexer + ind = {'x': slice(2, 5), 'y': slice(5, 7)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': slice(1, 2), 'x': slice(2, 5), 'y': slice(5, 7)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': slice(1, 2), 'x': slice(2, 5), 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # orthogonal indexer + ind = {'band': np.array([2, 1, 0]), + 'x': np.array([1, 0]), 'y': np.array([0, 2])} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': np.array([2, 1, 0]), + 'x': np.array([1, 0]), 
'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # vectorized indexer + ind = {'band': DataArray([2, 1, 0], dims='a'), + 'x': DataArray([1, 0, 0], dims='a'), + 'y': np.array([0, 2])} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = { + 'band': DataArray([[2, 1, 0], [1, 0, 2]], dims=['a', 'b']), + 'x': DataArray([[1, 0, 0], [0, 1, 0]], dims=['a', 'b']), + 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory - # Slicing - ex = expected.isel(x=slice(2, 5), y=slice(5, 7)) - ac = actual.isel(x=slice(2, 5), y=slice(5, 7)) - assert_allclose(ac, ex) - - ex = expected.isel(band=slice(1, 2), x=slice(2, 5), - y=slice(5, 7)) - ac = actual.isel(band=slice(1, 2), x=slice(2, 5), - y=slice(5, 7)) - assert_allclose(ac, ex) # Selecting lists of bands is fine ex = expected.isel(band=[1, 2]) @@ -2347,15 +2376,6 @@ def test_indexing(self): ac = actual.isel(band=[0, 2]) assert_allclose(ac, ex) - # but on x and y only windowed operations are allowed, more - # exotic slicing should raise an error - err_msg = 'not valid on rasterio' - with raises_regex(IndexError, err_msg): - actual.isel(x=[2, 4], y=[1, 3]).values - with raises_regex(IndexError, err_msg): - actual.isel(x=[4, 2]).values - with raises_regex(IndexError, err_msg): - actual.isel(x=slice(5, 2, -1)).values # Integer indexing ex = expected.isel(band=1) ac = actual.isel(band=1) @@ -2393,11 +2413,6 @@ def test_caching(self): # Cache is the default with xr.open_rasterio(tmp_file) as actual: - # Without cache an error is raised - err_msg = 'not valid on rasterio' - with raises_regex(IndexError, err_msg): - actual.isel(x=[2, 4]).values - # This should cache everything assert_allclose(actual, expected) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 61f3abf48d3..8abe89224d4 100644 --- a/xarray/tests/test_indexing.py +++ 
b/xarray/tests/test_indexing.py @@ -402,19 +402,37 @@ def get_indexers(shape, mode): indexer = tuple(np.random.randint(0, s, s + 2) for s in shape) return indexing.OuterIndexer(indexer) + elif mode == 'outer_scalar': + indexer = (np.random.randint(0, 3, 4), 0, slice(None, None, 2)) + return indexing.OuterIndexer(indexer[:len(shape)]) + + elif mode == 'outer_scalar2': + indexer = (np.random.randint(0, 3, 4), -2, slice(None, None, 2)) + return indexing.OuterIndexer(indexer[:len(shape)]) + elif mode == 'outer1vec': indexer = [slice(2, -3) for s in shape] indexer[1] = np.random.randint(0, shape[1], shape[1] + 2) return indexing.OuterIndexer(tuple(indexer)) - else: # basic indexer + elif mode == 'basic': # basic indexer indexer = [slice(2, -3) for s in shape] indexer[0] = 3 return indexing.BasicIndexer(tuple(indexer)) + elif mode == 'basic1': # basic indexer + return indexing.BasicIndexer((3, )) + + elif mode == 'basic2': # basic indexer + indexer = [0, 2, 4] + return indexing.BasicIndexer(tuple(indexer[:len(shape)])) + @pytest.mark.parametrize('shape', [(10, 5, 8), (10, 3)]) -@pytest.mark.parametrize('indexer_mode', ['vectorized', 'outer', 'outer1vec']) +@pytest.mark.parametrize('indexer_mode', + ['vectorized', 'outer', 'outer_scalar', + 'outer_scalar2', 'outer1vec', + 'basic', 'basic1']) @pytest.mark.parametrize('indexing_support', [indexing.IndexingSupport.BASIC, indexing.IndexingSupport.OUTER, @@ -433,6 +451,11 @@ def test_decompose_indexers(shape, indexer_mode, indexing_support): array = indexing.NumpyIndexingAdapter(array)[np_ind] np.testing.assert_array_equal(expected, array) + if not all(isinstance(k, indexing.integer_types) for k in np_ind.tuple): + combined_ind = indexing._combine_indexers(backend_ind, shape, np_ind) + array = indexing.NumpyIndexingAdapter(data)[combined_ind] + np.testing.assert_array_equal(expected, array) + def test_implicit_indexing_adapter(): array = np.arange(10) From 2dff2784f0b6bec731a8cbfdcaa83c8d39013e60 Mon Sep 17 00:00:00 2001 From: 
Keisuke Fujii Date: Sun, 18 Feb 2018 11:32:43 +0900 Subject: [PATCH 23/32] flake8 --- xarray/tests/test_backends.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cb23c800a6f..90ea439365c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2367,7 +2367,6 @@ def test_indexing(self): assert_allclose(expected.isel(**ind), actual.isel(**ind)) assert not actual.variable._in_memory - # Selecting lists of bands is fine ex = expected.isel(band=[1, 2]) ac = actual.isel(band=[1, 2]) From ead6327ec7ec6d47a8321ea60efded5896556425 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 13:25:36 +0900 Subject: [PATCH 24/32] More tests --- xarray/core/indexing.py | 4 ++ xarray/tests/test_backends.py | 69 ++++++++++++++++++++++------------- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a0a8ef26159..ac6398d7cc0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -543,6 +543,10 @@ def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): + # load if the indexed array becomes a scalar + if all(isinstance(ind, integer_types) for ind in indexer.tuple): + key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) + return np.asarray(self.array[key], dtype=None) return type(self)(self.array, self._updated_key(indexer)) def transpose(self, order): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 90ea439365c..2f9c721add3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -428,32 +428,49 @@ def test_vectorized_indexing(self): actual = on_disk.isel(**indexers) assert_identical(expected, actual) - # make sure lazy indexing certainly works. 
- with self.roundtrip(in_memory) as on_disk: - indexers = { - 'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), - 'dim3': DataArray([[0, 4], [1, 3], [2, 2]], dims=['a', 'b']), - 'dim2': slice(None, 10)} - expected = in_memory.isel(**indexers)['var3'] - actual = on_disk.isel(**indexers)['var3'] - # make sure the array is not yet loaded into memory - assert not actual.variable._in_memory - indexers = {'a': DataArray([0, 1], dims=['c']), - 'b': DataArray([0, 1], dims=['c'])} - expected = expected.isel(**indexers) - actual = actual.isel(**indexers) - assert not actual.variable._in_memory - assert_identical(expected, actual.load()) - - with self.roundtrip(in_memory) as on_disk: - indexers = { - 'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), - 'dim2': slice(None, 10)} - expected = in_memory.isel(**indexers)['var3'] - actual = on_disk.isel(**indexers)['var3'] - # make sure the array is not yet loaded into memory - assert not actual.variable._in_memory - assert_identical(expected, actual.load()) + def multiple_indexing(indexers): + # make sure a sequence of lazy indexings certainly works. 
+ with self.roundtrip(in_memory) as on_disk: + actual = on_disk['var3'] + expected = in_memory['var3'] + for ind in indexers: + actual = actual.isel(**ind) + expected = expected.isel(**ind) + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + + # two-staged vectorized-indexing + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': DataArray([[0, 4], [1, 3], [2, 2]], dims=['a', 'b'])}, + {'a': DataArray([0, 1], dims=['c']), + 'b': DataArray([0, 1], dims=['c'])} + ] + multiple_indexing(indexers) + + # vectorized-slice mixed + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(None, 10)} + ] + multiple_indexing(indexers) + + # vectorized-integer mixed + indexers = [ + {'dim3': 0}, + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b'])}, + {'a': slice(None, None, 2)} + ] + multiple_indexing(indexers) + + # vectorized-integer mixed + indexers = [ + {'dim3': 0}, + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b'])}, + {'a': 1, 'b': 0} + ] + multiple_indexing(indexers) def test_isel_dataarray(self): # Make sure isel works lazily. GH:issue:1688 From a90ac052a330ae5cee8c8c09c1686b31f3ebfe2b Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 16:18:09 +0900 Subject: [PATCH 25/32] Use LazilyIndexedArray for scalar array instead of loading. --- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index ac6398d7cc0..83d1f3276b4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -543,10 +543,10 @@ def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): - # load if the indexed array becomes a scalar + # If the indexed array becomes a scalar, return LazilyIndexedArray. 
if all(isinstance(ind, integer_types) for ind in indexer.tuple): key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) - return np.asarray(self.array[key], dtype=None) + return LazilyIndexedArray(self.array, key) return type(self)(self.array, self._updated_key(indexer)) def transpose(self, order): From 73f495821c2d9eee98d2079e87e8a5e0f886304d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 22:18:54 +0900 Subject: [PATCH 26/32] Support negative step slice in rasterio. --- xarray/backends/rasterio_.py | 8 ++++++-- xarray/tests/test_backends.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index e4121a5c768..c17a838d1e7 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -71,7 +71,7 @@ def _get_indexer(self, key): band_key = np.arange(start, stop, step) # be sure we give out a list band_key = (np.asarray(band_key) + 1).tolist() - if hasattr(band_key, '__len__'): # if band_key is not a scalar + if isinstance(band_key, list): # if band_key is not a scalar new_shape.append(len(band_key)) np_inds2.append(slice(None)) @@ -81,7 +81,11 @@ def _get_indexer(self, key): for i, (k, n) in enumerate(zip(key.tuple[1:], self.shape[1:])): if isinstance(k, slice): start, stop, step = k.indices(n) - np_inds2.append(slice(None, None, step)) + if step > 0: + np_inds2.append(slice(None, None, step)) + else: + start, stop = stop + 1, start + 1 + np_inds2.append(slice(-1, None, step)) new_shape.append(stop - start) elif is_scalar(k): # windowed operations will always return an array diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2f9c721add3..144e82c09c3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -472,6 +472,13 @@ def multiple_indexing(indexers): ] multiple_indexing(indexers) + # with negative step slice. 
+ indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(-1, 1, -2)}, + ] + multiple_indexing(indexers) + def test_isel_dataarray(self): # Make sure isel works lazily. GH:issue:1688 in_memory = create_test_data() @@ -2370,6 +2377,23 @@ def test_indexing(self): assert_allclose(expected.isel(**ind), actual.isel(**ind)) assert not actual.variable._in_memory + # minus-stepped slice + ind = {'band': np.array([2, 1, 0]), + 'x': slice(-1, None, -1), 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': np.array([2, 1, 0]), + 'x': 1, 'y': slice(-1, 1, -2)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # None is selected + ind = {'band': np.array([2, 1, 0]), + 'x': 1, 'y': slice(2, 2, 1)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + # vectorized indexer ind = {'band': DataArray([2, 1, 0], dims='a'), 'x': DataArray([1, 0, 0], dims='a'), From fd0496678d13ec9fd69cdc7e5c4825039037bdbe Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 19 Feb 2018 08:31:06 +0900 Subject: [PATCH 27/32] Make slice-step always positive --- xarray/backends/rasterio_.py | 7 ++-- xarray/core/indexing.py | 67 ++++++++++++++++++++++------------- xarray/tests/test_backends.py | 7 ++++ xarray/tests/test_indexing.py | 41 +++++++++++++++------ 4 files changed, 82 insertions(+), 40 deletions(-) diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index c17a838d1e7..e5adbb20773 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -80,12 +80,9 @@ def _get_indexer(self, key): squeeze_axis = [] for i, (k, n) in enumerate(zip(key.tuple[1:], self.shape[1:])): if isinstance(k, slice): + # step is always positive. 
see indexing.decompose_indexer start, stop, step = k.indices(n) - if step > 0: - np_inds2.append(slice(None, None, step)) - else: - start, stop = stop + 1, start + 1 - np_inds2.append(slice(-1, None, step)) + np_inds2.append(slice(None, None, step)) new_shape.append(stop - start) elif is_scalar(k): # windowed operations will always return an array diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 83d1f3276b4..a37c82eab4f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -736,13 +736,27 @@ class IndexingSupport(object): # could inherit from enum.Enum on Python 3 def decompose_indexer(indexer, shape, indexing_support): if isinstance(indexer, VectorizedIndexer): return _decompose_vectorized_indexer(indexer, shape, indexing_support) - if isinstance(indexer, OuterIndexer): + if isinstance(indexer, (BasicIndexer, OuterIndexer)): return _decompose_outer_indexer(indexer, shape, indexing_support) - if isinstance(indexer, BasicIndexer): - return indexer, BasicIndexer(()) raise TypeError('unexpected key type: {}'.format(indexer)) +def _decompose_slice(key, size): + """ convert a slice to successive two slices. The first slice always has + a positive step. + """ + start, stop, step = key.indices(size) + if step > 0: + # If key already has a positive step, use it as is in the backend + return key, slice(None) + else: + # determine stop precisely for step > 1 case + # e.g. 
[98:2:-2] -> [98:3:-2] + stop = start + int((stop - start - 1) / step) * step + 1 + start, stop = stop + 1, start + 1 + return slice(start, stop, -step), slice(None, None, -1) + + def _decompose_vectorized_indexer(indexer, shape, indexing_support): """ Decompose vectorized indexer to the successive two indexers, where the @@ -785,12 +799,14 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k for k, s in zip(indexer.tuple, shape)] - for k in indexer: + for k, s in zip(indexer, shape): if isinstance(k, slice): - # If it is a slice, then we will slice it as-is (k) in the backend, + # If it is a slice, then we will slice it as-is + # (but make its step positive) in the backend, # and then use all of it (slice(None)) for the in-memory portion. - backend_indexer.append(k) - np_indexer.append(slice(None)) + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) else: # If it is a (multidimensional) np.ndarray, just pickup the used # keys without duplication and store them as a 1d-np.ndarray. @@ -845,7 +861,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): """ if indexing_support == IndexingSupport.VECTORIZED: return indexer, BasicIndexer(()) - assert isinstance(indexer, OuterIndexer) + assert isinstance(indexer, (OuterIndexer, BasicIndexer)) backend_indexer = [] np_indexer = [] @@ -867,7 +883,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): if isinstance(k, np.ndarray) else 0 for k in indexer] array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None - for i, k in enumerate(indexer): + for i, (k, s) in enumerate(zip(indexer, shape)): if isinstance(k, np.ndarray) and i != array_index: # np.ndarray key is converted to slice that covers the entire # entries of this key. 
@@ -880,24 +896,26 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): np_indexer.append(ekey) elif isinstance(k, integer_types): backend_indexer.append(k) - else: - # If it is a slice or an integer, then we will slice it - # as-is (k) in the backend, and then use all of it - # slice(None)) for the in-memory portion. - backend_indexer.append(k) - np_indexer.append(slice(None)) + else: # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) return (OuterIndexer(tuple(backend_indexer)), OuterIndexer(tuple(np_indexer))) if indexing_support == IndexingSupport.OUTER: - for k in indexer: - if (isinstance(k, slice) or - (isinstance(k, np.ndarray) and (np.diff(k) >= 0).all())): - backend_indexer.append(k) - np_indexer.append(slice(None)) + for k, s in zip(indexer, shape): + if isinstance(k, slice): + # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) elif isinstance(k, integer_types): backend_indexer.append(k) + elif isinstance(k, np.ndarray) and (np.diff(k) >= 0).all(): + backend_indexer.append(k) + np_indexer.append(slice(None)) else: # Remove duplicates and sort them in the increasing order oind, vind = np.unique(k, return_inverse=True) @@ -908,7 +926,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): OuterIndexer(tuple(np_indexer))) # basic - for k in indexer: + for k, s in zip(indexer, shape): if isinstance(k, np.ndarray): # np.ndarray key is converted to slice that covers the entire # entries of this key. 
@@ -916,9 +934,10 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): np_indexer.append(k - np.min(k)) elif isinstance(k, integer_types): backend_indexer.append(k) - else: # slice - backend_indexer.append(k) - np_indexer.append(slice(None)) + else: # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) return (BasicIndexer(tuple(backend_indexer)), OuterIndexer(tuple(np_indexer))) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 144e82c09c3..6952edc4d63 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -472,6 +472,13 @@ def multiple_indexing(indexers): ] multiple_indexing(indexers) + # with negative step slice. + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(-1, 1, -1)}, + ] + multiple_indexing(indexers) + # with negative step slice. indexers = [ {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 8abe89224d4..39172c73be9 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -139,16 +139,18 @@ def test_indexer(data, x, expected_pos, expected_idx=None): class TestLazyArray(TestCase): def test_slice_slice(self): I = ReturnItem() # noqa: E741 # allow ambiguous name - x = np.arange(100) - slices = [I[:3], I[:4], I[2:4], I[:1], I[:-1], I[5:-1], I[-5:-1], - I[::-1], I[5::-1], I[:3:-1], I[:30:-1], I[10:4:], I[::4], - I[4:4:4], I[:4:-4]] - for i in slices: - for j in slices: - expected = x[i][j] - new_slice = indexing.slice_slice(i, j, size=100) - actual = x[new_slice] - assert_array_equal(expected, actual) + for size in [100, 99]: + # We test even/odd size cases + x = np.arange(size) + slices = [I[:3], I[:4], I[2:4], I[:1], I[:-1], I[5:-1], I[-5:-1], + I[::-1], I[5::-1], I[:3:-1], I[:30:-1], I[10:4:], I[::4], + I[4:4:4], 
I[:4:-4]] + for i in slices: + for j in slices: + expected = x[i][j] + new_slice = indexing.slice_slice(i, j, size=100) + actual = x[new_slice] + assert_array_equal(expected, actual) def test_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) @@ -427,12 +429,29 @@ def get_indexers(shape, mode): indexer = [0, 2, 4] return indexing.BasicIndexer(tuple(indexer[:len(shape)])) + elif mode == 'basic3': # basic indexer + indexer = [slice(None) for s in shape] + indexer[0] = slice(-2, 2, -2) + indexer[1] = slice(1, -1, 2) + return indexing.BasicIndexer(tuple(indexer[:len(shape)])) + + +@pytest.mark.parametrize('size', [100, 99]) +@pytest.mark.parametrize('sl', [slice(1, -1, 1), slice(None, -1, 2), + slice(-1, 1, -1), slice(-1, 1, -2)]) +def test_decompose_slice(size, sl): + x = np.arange(size) + slice1, slice2 = indexing._decompose_slice(sl, size) + expected = x[sl] + actual = x[slice1][slice2] + assert_array_equal(expected, actual) + @pytest.mark.parametrize('shape', [(10, 5, 8), (10, 3)]) @pytest.mark.parametrize('indexer_mode', ['vectorized', 'outer', 'outer_scalar', 'outer_scalar2', 'outer1vec', - 'basic', 'basic1']) + 'basic', 'basic1', 'basic2', 'basic3']) @pytest.mark.parametrize('indexing_support', [indexing.IndexingSupport.BASIC, indexing.IndexingSupport.OUTER, From b3c3d804afb9e2e163cfa27186ed6f6b058087f9 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 25 Feb 2018 11:42:12 +0900 Subject: [PATCH 28/32] Bugfix in slice-slice --- xarray/core/indexing.py | 2 +- xarray/tests/test_indexing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a37c82eab4f..60a5342ac0e 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -256,7 +256,7 @@ def slice_slice(old_slice, applied_slice, size): items = _expand_slice(old_slice, size)[applied_slice] if len(items) > 0: start = items[0] - stop = items[-1] + step + stop = items[-1] + int(np.sign(step)) if stop < 0: stop = 
None else: diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 39172c73be9..691020151b5 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -144,11 +144,11 @@ def test_slice_slice(self): x = np.arange(size) slices = [I[:3], I[:4], I[2:4], I[:1], I[:-1], I[5:-1], I[-5:-1], I[::-1], I[5::-1], I[:3:-1], I[:30:-1], I[10:4:], I[::4], - I[4:4:4], I[:4:-4]] + I[4:4:4], I[:4:-4], I[::-2]] for i in slices: for j in slices: expected = x[i][j] - new_slice = indexing.slice_slice(i, j, size=100) + new_slice = indexing.slice_slice(i, j, size=size) actual = x[new_slice] assert_array_equal(expected, actual) From 259f36ce69b0e1115c5add199bb0b229099c3485 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 25 Feb 2018 20:51:57 +0900 Subject: [PATCH 29/32] Add pydap support. --- xarray/backends/pydap_.py | 2 +- xarray/tests/test_backends.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index de79203c207..f1a72914c03 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -38,7 +38,7 @@ def __getitem__(self, key): result = np.squeeze(result, axis) if len(np_inds.tuple) > 0: - result = indexing.NumpyIndexingAdapter(result)[np_inds] + result = indexing.NumpyIndexingAdapter(np.asarray(result))[np_inds] return result diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6952edc4d63..1abe56ad064 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2139,6 +2139,17 @@ def test_cmp_local_file(self): assert_equal(actual.isel(j=slice(1, 2)), expected.isel(j=slice(1, 2))) + with self.create_datasets() as (actual, expected): + indexers = {'i': [1, 0, 0], 'j': [1, 2, 0, 1]} + assert_equal(actual.isel(**indexers), + expected.isel(**indexers)) + + with self.create_datasets() as (actual, expected): + indexers = {'i': DataArray([0, 1, 0], dims='a'), + 'j': DataArray([0, 2, 1], 
dims='a')} + assert_equal(actual.isel(**indexers), + expected.isel(**indexers)) + def test_compatible_to_netcdf(self): # make sure it can be saved as a netcdf with self.create_datasets() as (actual, expected): From 7e0959cf0d8f0c64a39f7a27ec0fe453d282349f Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 2 Mar 2018 23:18:40 +0900 Subject: [PATCH 30/32] Rename LazilyIndexedArray -> LazilyOuterIndexedArray. Remove duplicate in zarr.py --- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/netCDF4_.py | 2 +- xarray/backends/pydap_.py | 2 +- xarray/backends/pynio_.py | 2 +- xarray/backends/rasterio_.py | 2 +- xarray/backends/zarr.py | 30 +++------------------------ xarray/conventions.py | 2 +- xarray/core/indexing.py | 12 ++++++----- xarray/core/variable.py | 2 +- xarray/tests/test_backends.py | 21 ------------------- xarray/tests/test_dataset.py | 2 +- xarray/tests/test_indexing.py | 39 ++++++++++++++++++++++++++++------- xarray/tests/test_variable.py | 18 ++++++++-------- 13 files changed, 58 insertions(+), 78 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 42149638a80..1d166f05eb1 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -90,7 +90,7 @@ def __init__(self, filename, mode='r', format=None, group=None, def open_store_variable(self, name, var): with self.ensure_open(autoclose=False): dimensions = var.dimensions - data = indexing.LazilyIndexedArray( + data = indexing.LazilyOuterIndexedArray( H5NetCDFArrayWrapper(name, self)) attrs = _read_attributes(var) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 915f3e695fc..666ec84d875 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -279,7 +279,7 @@ def open(cls, filename, mode='r', format='NETCDF4', group=None, def open_store_variable(self, name, var): with self.ensure_open(autoclose=False): dimensions = var.dimensions - data = indexing.LazilyIndexedArray(NetCDF4ArrayWrapper(name, 
self)) + data = indexing.LazilyOuterIndexedArray(NetCDF4ArrayWrapper(name, self)) attributes = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) _ensure_fill_value_valid(data, attributes) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index ed64d13110a..4a932e3dad2 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -78,7 +78,7 @@ def open(cls, url, session=None): return cls(ds) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) + data = indexing.LazilyOuterIndexedArray(PydapArrayWrapper(var)) return Variable(var.dimensions, data, _fix_attributes(var.attributes)) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 378a0e0bfd0..95226e453b4 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -56,7 +56,7 @@ def __init__(self, filename, mode='r', autoclose=False): self._mode = mode def open_store_variable(self, name, var): - data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray(NioArrayWrapper(name, self)) return Variable(var.dimensions, data, var.attributes) def get_variables(self): diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index c5e59cb18e3..f856f9f9e13 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -274,7 +274,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, else: attrs[k] = v - data = indexing.LazilyIndexedArray(RasterioArrayWrapper(riods)) + data = indexing.LazilyOuterIndexedArray(RasterioArrayWrapper(riods)) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b0323b51f17..3058e935238 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -41,30 +41,6 @@ def _ensure_valid_fill_value(value, dtype): return _encode_zarr_attr_value(valid) -def 
_replace_slices_with_arrays(key, shape): - """Replace slice objects in vindex with equivalent ndarray objects.""" - num_slices = sum(1 for k in key if isinstance(k, slice)) - ndims = [k.ndim for k in key if isinstance(k, np.ndarray)] - array_subspace_size = max(ndims) if ndims else 0 - assert len(key) == len(shape) - new_key = [] - slice_count = 0 - for k, size in zip(key, shape): - if isinstance(k, slice): - # the slice subspace always appears after the ndarray subspace - array = np.arange(*k.indices(size)) - sl = [np.newaxis] * len(shape) - sl[array_subspace_size + slice_count] = slice(None) - k = array[tuple(sl)] - slice_count += 1 - else: - assert isinstance(k, np.ndarray) - k = k[(slice(None),) * array_subspace_size + - (np.newaxis,) * num_slices] - new_key.append(k) - return tuple(new_key) - - class ZarrArrayWrapper(BackendArray): def __init__(self, variable_name, datastore): self.datastore = datastore @@ -84,8 +60,8 @@ def __getitem__(self, key): if isinstance(key, indexing.BasicIndexer): return array[key.tuple] elif isinstance(key, indexing.VectorizedIndexer): - return array.vindex[_replace_slices_with_arrays(key.tuple, - self.shape)] + return array.vindex[indexing._arrayize_vectorized_indexer( + key.tuple, self.shape).tuple] else: assert isinstance(key, indexing.OuterIndexer) return array.oindex[key.tuple] @@ -292,7 +268,7 @@ def __init__(self, zarr_group, writer=None): super(ZarrStore, self).__init__(zarr_writer) def open_store_variable(self, name, zarr_array): - data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY) attributes = OrderedDict(attributes) diff --git a/xarray/conventions.py b/xarray/conventions.py index 5bcbd83ee90..93fd56ed5d2 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -490,7 +490,7 @@ def decode_cf_variable(name, var, concat_characters=True, 
mask_and_scale=True, del attributes['dtype'] data = BoolTypeArray(data) - return Variable(dimensions, indexing.LazilyIndexedArray(data), + return Variable(dimensions, indexing.LazilyOuterIndexedArray(data), attributes, encoding=encoding) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 9bac64dc5ba..f65586b720c 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -440,7 +440,7 @@ def __getitem__(self, key): return self.array[self.indexer_cls(key)] -class LazilyIndexedArray(ExplicitlyIndexedNDArrayMixin): +class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make basic and orthogonal indexing lazy. """ @@ -541,10 +541,10 @@ def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): - # If the indexed array becomes a scalar, return LazilyIndexedArray. + # If the indexed array becomes a scalar, return LazilyOuterIndexedArray. if all(isinstance(ind, integer_types) for ind in indexer.tuple): key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) - return LazilyIndexedArray(self.array, key) + return LazilyOuterIndexedArray(self.array, key) return type(self)(self.array, self._updated_key(indexer)) def transpose(self, order): @@ -645,7 +645,7 @@ def _outer_to_vectorized_indexer(key, shape): Returns ------- - VectorizedInexer + VectorizedIndexer Tuple suitable for use to index a NumPy array with vectorized indexing. Each element is an array: broadcasting them together gives the shape of the result. 
@@ -923,7 +923,9 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): return (OuterIndexer(tuple(backend_indexer)), OuterIndexer(tuple(np_indexer))) - # basic + # basic indexer + assert indexing_support == IndexingSupport.BASIC + for k, s in zip(indexer, shape): if isinstance(k, np.ndarray): # np.ndarray key is converted to slice that covers the entire diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bb4285fba0a..bd9229c1ee3 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -121,7 +121,7 @@ def _maybe_wrap_data(data): Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure they can be indexed properly. - NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should + NumpyArrayAdapter, PandasIndexAdapter and LazilyOuterIndexedArray should all pass through unmodified. """ if isinstance(data, pd.Index): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 773df51188c..21152829096 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1402,27 +1402,6 @@ def create_zarr_target(self): yield tmp -def test_replace_slices_with_arrays(): - (actual,) = xr.backends.zarr._replace_slices_with_arrays( - key=(slice(None),), shape=(5,)) - np.testing.assert_array_equal(actual, np.arange(5)) - - actual = xr.backends.zarr._replace_slices_with_arrays( - key=(np.arange(5),) * 3, shape=(8, 10, 12)) - expected = np.stack([np.arange(5)] * 3) - np.testing.assert_array_equal(np.stack(actual), expected) - - a, b = xr.backends.zarr._replace_slices_with_arrays( - key=(np.arange(5), slice(None)), shape=(8, 10)) - np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) - np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) - - a, b = xr.backends.zarr._replace_slices_with_arrays( - key=(slice(None), np.arange(5)), shape=(8, 10)) - np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) - np.testing.assert_array_equal(b, np.arange(5)[:, 
np.newaxis]) - - @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1b557479ec1..99ca51536f1 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -77,7 +77,7 @@ def get_variables(self): def lazy_inaccessible(k, v): if k in self._indexvars: return v - data = indexing.LazilyIndexedArray(InaccessibleArray(v.values)) + data = indexing.LazilyOuterIndexedArray(InaccessibleArray(v.values)) return Variable(v.dims, data, v.attrs) return dict((k, lazy_inaccessible(k, v)) for k, v in iteritems(self._variables)) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 850b1d85dd1..8a3e2ccbc99 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -153,7 +153,7 @@ def test_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) v = Variable(['i', 'j', 'k'], original) - lazy = indexing.LazilyIndexedArray(x) + lazy = indexing.LazilyOuterIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # noqa: E741 # allow ambiguous name # test orthogonally applied indexers @@ -172,7 +172,7 @@ def test_lazily_indexed_array(self): assert expected.shape == actual.shape assert_array_equal(expected, actual) assert isinstance(actual._data, - indexing.LazilyIndexedArray) + indexing.LazilyOuterIndexedArray) # make sure actual.key is appropriate type if all(isinstance(k, native_int_types + (slice, )) @@ -191,7 +191,7 @@ def test_lazily_indexed_array(self): actual = v_lazy[i][j] assert expected.shape == actual.shape assert_array_equal(expected, actual) - assert isinstance(actual._data, indexing.LazilyIndexedArray) + assert isinstance(actual._data, indexing.LazilyOuterIndexedArray) assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter) @@ -199,7 +199,7 @@ def test_vectorized_lazily_indexed_array(self): 
original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) v_eager = Variable(['i', 'j', 'k'], x) - lazy = indexing.LazilyIndexedArray(x) + lazy = indexing.LazilyOuterIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # noqa: E741 # allow ambiguous name @@ -210,7 +210,7 @@ def check_indexing(v_eager, v_lazy, indexers): assert expected.shape == actual.shape assert isinstance(actual._data, (indexing.LazilyVectorizedIndexedArray, - indexing.LazilyIndexedArray)) + indexing.LazilyOuterIndexedArray)) assert_array_equal(expected, actual) v_eager = expected v_lazy = actual @@ -263,19 +263,19 @@ def test_index_scalar(self): class TestMemoryCachedArray(TestCase): def test_wrapper(self): - original = indexing.LazilyIndexedArray(np.arange(10)) + original = indexing.LazilyOuterIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) assert_array_equal(wrapped, np.arange(10)) assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter) def test_sub_array(self): - original = indexing.LazilyIndexedArray(np.arange(10)) + original = indexing.LazilyOuterIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) child = wrapped[B[:5]] assert isinstance(child, indexing.MemoryCachedArray) assert_array_equal(child, np.arange(5)) assert isinstance(child.array, indexing.NumpyIndexingAdapter) - assert isinstance(wrapped.array, indexing.LazilyIndexedArray) + assert isinstance(wrapped.array, indexing.LazilyOuterIndexedArray) def test_setitem(self): original = np.arange(10) @@ -389,6 +389,29 @@ def test_arrayize_vectorized_indexer(self): np.testing.assert_array_equal( self.data[vindex], self.data[vindex_array],) + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((slice(None),)), shape=(5,)) + np.testing.assert_array_equal(actual.tuple, [np.arange(5)]) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((np.arange(5),) * 3), shape=(8, 10, 12)) + expected = 
np.stack([np.arange(5)] * 3) + np.testing.assert_array_equal(np.stack(actual.tuple), expected) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((np.arange(5), slice(None))), + shape=(8, 10)) + a, b = actual.tuple + np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) + np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((slice(None), np.arange(5))), + shape=(8, 10)) + a, b = actual.tuple + np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) + np.testing.assert_array_equal(b, np.arange(5)[:, np.newaxis]) + def get_indexers(shape, mode): if mode == 'vectorized': diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 29b09c59627..2f5c617e75b 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -16,7 +16,7 @@ from xarray.core import indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( - BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, LazilyIndexedArray, + BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, LazilyOuterIndexedArray, MemoryCachedArray, NumpyIndexingAdapter, OuterIndexer, PandasIndexAdapter, VectorizedIndexer) from xarray.core.pycompat import PY3, OrderedDict @@ -988,9 +988,9 @@ def test_repr(self): assert expected == repr(v) def test_repr_lazy_data(self): - v = Variable('x', LazilyIndexedArray(np.arange(2e5))) + v = Variable('x', LazilyOuterIndexedArray(np.arange(2e5))) assert '200000 values with dtype' in repr(v) - assert isinstance(v._data, LazilyIndexedArray) + assert isinstance(v._data, LazilyOuterIndexedArray) def test_detect_indexer_type(self): """ Tests indexer type was correctly detected. 
""" @@ -1798,7 +1798,7 @@ def test_rolling_window(self): class TestAsCompatibleData(TestCase): def test_unchanged_types(self): - types = (np.asarray, PandasIndexAdapter, indexing.LazilyIndexedArray) + types = (np.asarray, PandasIndexAdapter, indexing.LazilyOuterIndexedArray) for t in types: for data in [np.arange(3), pd.date_range('2000-01-01', periods=3), @@ -1961,17 +1961,17 @@ def test_NumpyIndexingAdapter(self): v = Variable(dims=('x', 'y'), data=NumpyIndexingAdapter( NumpyIndexingAdapter(self.d))) - def test_LazilyIndexedArray(self): - v = Variable(dims=('x', 'y'), data=LazilyIndexedArray(self.d)) + def test_LazilyOuterIndexedArray(self): + v = Variable(dims=('x', 'y'), data=LazilyOuterIndexedArray(self.d)) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping v = Variable(dims=('x', 'y'), - data=LazilyIndexedArray(LazilyIndexedArray(self.d))) + data=LazilyOuterIndexedArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) # hierarchical wrapping v = Variable(dims=('x', 'y'), - data=LazilyIndexedArray(NumpyIndexingAdapter(self.d))) + data=LazilyOuterIndexedArray(NumpyIndexingAdapter(self.d))) self.check_orthogonal_indexing(v) def test_CopyOnWriteArray(self): @@ -1980,7 +1980,7 @@ def test_CopyOnWriteArray(self): self.check_vectorized_indexing(v) # doubly wrapping v = Variable(dims=('x', 'y'), - data=CopyOnWriteArray(LazilyIndexedArray(self.d))) + data=CopyOnWriteArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) From 4fccdeece6e38a0dce7c9bf79c396302edf3e04a Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 2 Mar 2018 23:28:41 +0900 Subject: [PATCH 31/32] flake8 --- xarray/backends/netCDF4_.py | 3 ++- xarray/core/indexing.py | 2 +- xarray/tests/test_dataset.py | 3 ++- xarray/tests/test_variable.py | 23 +++++++++++++---------- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py 
index 666ec84d875..4903e9a98f2 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -279,7 +279,8 @@ def open(cls, filename, mode='r', format='NETCDF4', group=None, def open_store_variable(self, name, var): with self.ensure_open(autoclose=False): dimensions = var.dimensions - data = indexing.LazilyOuterIndexedArray(NetCDF4ArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray( + NetCDF4ArrayWrapper(name, self)) attributes = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) _ensure_fill_value_valid(data, attributes) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f65586b720c..d5099b5bc16 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -541,7 +541,7 @@ def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): - # If the indexed array becomes a scalar, return LazilyOuterIndexedArray. + # If the indexed array becomes a scalar, return LazilyOuterIndexedArray if all(isinstance(ind, integer_types) for ind in indexer.tuple): key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) return LazilyOuterIndexedArray(self.array, key) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 99ca51536f1..e3e2934a4fd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -77,7 +77,8 @@ def get_variables(self): def lazy_inaccessible(k, v): if k in self._indexvars: return v - data = indexing.LazilyOuterIndexedArray(InaccessibleArray(v.values)) + data = indexing.LazilyOuterIndexedArray( + InaccessibleArray(v.values)) return Variable(v.dims, data, v.attrs) return dict((k, lazy_inaccessible(k, v)) for k, v in iteritems(self._variables)) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 2f5c617e75b..c4489f50246 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -16,9 +16,9 @@ from xarray.core import indexing from 
xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( - BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, LazilyOuterIndexedArray, - MemoryCachedArray, NumpyIndexingAdapter, OuterIndexer, PandasIndexAdapter, - VectorizedIndexer) + BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, + LazilyOuterIndexedArray, MemoryCachedArray, NumpyIndexingAdapter, + OuterIndexer, PandasIndexAdapter, VectorizedIndexer) from xarray.core.pycompat import PY3, OrderedDict from xarray.core.utils import NDArrayMixin from xarray.core.variable import as_compatible_data, as_variable @@ -1798,7 +1798,7 @@ def test_rolling_window(self): class TestAsCompatibleData(TestCase): def test_unchanged_types(self): - types = (np.asarray, PandasIndexAdapter, indexing.LazilyOuterIndexedArray) + types = (np.asarray, PandasIndexAdapter, LazilyOuterIndexedArray) for t in types: for data in [np.arange(3), pd.date_range('2000-01-01', periods=3), @@ -1966,12 +1966,14 @@ def test_LazilyOuterIndexedArray(self): self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=('x', 'y'), - data=LazilyOuterIndexedArray(LazilyOuterIndexedArray(self.d))) + v = Variable( + dims=('x', 'y'), + data=LazilyOuterIndexedArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) # hierarchical wrapping - v = Variable(dims=('x', 'y'), - data=LazilyOuterIndexedArray(NumpyIndexingAdapter(self.d))) + v = Variable( + dims=('x', 'y'), + data=LazilyOuterIndexedArray(NumpyIndexingAdapter(self.d))) self.check_orthogonal_indexing(v) def test_CopyOnWriteArray(self): @@ -1979,8 +1981,9 @@ def test_CopyOnWriteArray(self): self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=('x', 'y'), - data=CopyOnWriteArray(LazilyOuterIndexedArray(self.d))) + v = Variable( + dims=('x', 'y'), + data=CopyOnWriteArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) 
self.check_vectorized_indexing(v) From 8e967105194d7b4208bcac22127cd0cb01a7a484 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 3 Mar 2018 14:25:13 +0900 Subject: [PATCH 32/32] Added transpose to LazilyOuterIndexedArray --- doc/whats-new.rst | 3 +-- xarray/core/indexing.py | 15 ++++++++++++++- xarray/core/variable.py | 3 ++- xarray/tests/test_indexing.py | 13 ++++++++++++- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c769eb398e3..0fad6882440 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -40,10 +40,9 @@ Enhancements - Support lazy vectorized-indexing. After this change, flexible indexing such as orthogonal/vectorized indexing, becomes possible for all the backend - arrays. (:issue:`1897`) + arrays. Also, lazy ``transpose`` is now also supported. (:issue:`1897`) By `Keisuke Fujii `_. - - Improve :py:func:`~xarray.DataArray.rolling` logic. :py:func:`~xarray.DataArrayRolling` object now supports :py:func:`~xarray.DataArrayRolling.construct` method that returns a view diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d5099b5bc16..bd16618d766 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -441,7 +441,7 @@ def __getitem__(self, key): class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): - """Wrap an array to make basic and orthogonal indexing lazy. + """Wrap an array to make basic and outer indexing lazy. 
""" def __init__(self, array, key=None): @@ -493,6 +493,10 @@ def __array__(self, dtype=None): array = as_indexable(self.array) return np.asarray(array[self.key], dtype=None) + def transpose(self, order): + return LazilyVectorizedIndexedArray( + self.array, self.key).transpose(order) + def __getitem__(self, indexer): if isinstance(indexer, VectorizedIndexer): array = LazilyVectorizedIndexedArray(self.array, self.key) @@ -1117,6 +1121,9 @@ def _indexing_array_and_key(self, key): return array, key + def transpose(self, order): + return self.array.transpose(order) + def __getitem__(self, key): array, key = self._indexing_array_and_key(key) return self._ensure_ndarray(array[key]) @@ -1160,6 +1167,9 @@ def __setitem__(self, key, value): 'into memory explicitly using the .load() ' 'method or accessing its .values attribute.') + def transpose(self, order): + return self.array.transpose(order) + class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" @@ -1234,6 +1244,9 @@ def __getitem__(self, indexer): return result + def transpose(self, order): + return self.array # self.array should be always one-dimensional + def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bd9229c1ee3..66a4e781161 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1053,7 +1053,8 @@ def transpose(self, *dims): axes = self.get_axis_num(dims) if len(dims) < 2: # no need to transpose if only one dimension return self.copy(deep=False) - data = duck_array_ops.transpose(self.data, axes) + + data = as_indexable(self._data).transpose(axes) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 8a3e2ccbc99..0d1045d35c0 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ 
-187,10 +187,21 @@ def test_lazily_indexed_array(self): indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0), ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])] for i, j in indexers: - expected = np.asarray(v[i][j]) + expected = v[i][j] actual = v_lazy[i][j] assert expected.shape == actual.shape assert_array_equal(expected, actual) + + # test transpose + if actual.ndim > 1: + order = np.random.choice(actual.ndim, actual.ndim) + order = np.array(actual.dims) + transposed = actual.transpose(*order) + assert_array_equal(expected.transpose(*order), transposed) + assert isinstance( + actual._data, (indexing.LazilyVectorizedIndexedArray, + indexing.LazilyOuterIndexedArray)) + assert isinstance(actual._data, indexing.LazilyOuterIndexedArray) assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter)