Multi-index indexing #802
@@ -86,24 +86,19 @@ def __init__(self, data_array):
         self.data_array = data_array

     def _remap_key(self, key):
-        def lookup_positions(dim, labels):
-            index = self.data_array.indexes[dim]
-            return indexing.convert_label_indexer(index, labels)
-
-        if utils.is_dict_like(key):
-            return dict((dim, lookup_positions(dim, labels))
-                        for dim, labels in iteritems(key))
-        else:
+        if not utils.is_dict_like(key):
             # expand the indexer so we can handle Ellipsis
-            key = indexing.expanded_indexer(key, self.data_array.ndim)
-            return tuple(lookup_positions(dim, labels) for dim, labels
-                         in zip(self.data_array.dims, key))
+            labels = indexing.expanded_indexer(key, self.data_array.ndim)
+            key = dict(zip(self.data_array.dims, labels))
+        return indexing.remap_label_indexers(self.data_array, key)

     def __getitem__(self, key):
-        return self.data_array[self._remap_key(key)]
+        pos_indexers, new_indexes = self._remap_key(key)
+        return self.data_array[pos_indexers]._replace_indexes(new_indexes)

     def __setitem__(self, key, value):
-        self.data_array[self._remap_key(key)] = value
+        pos_indexers, _ = self._remap_key(key)
+        self.data_array[pos_indexers] = value


 class _ThisArray(object):
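For context, a hedged usage sketch of what the reworked `.loc` path enables; it is not part of the diff, and the data, names, and coordinate construction here are invented for illustration:

```python
# Illustrative only -- assumes the multi-index support added in this PR.
import numpy as np
import pandas as pd
import xarray as xr

midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                  names=['letter', 'number'])
da = xr.DataArray(np.arange(4.0), [('x', midx)], name='var')

# .loc now routes through _remap_key -> remap_label_indexers, so plain
# labels on a multi-indexed dimension are remapped to positions:
da.loc['a']         # everything under letter='a'
da.sel(x=('a', 1))  # a single element via a full multi-index tuple
```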
@@ -244,6 +239,23 @@ def _replace_maybe_drop_dims(self, variable, name=__default):
                            if set(v.dims) <= allowed_dims)
         return self._replace(variable, coords, name)

+    def _replace_indexes(self, indexes):
+        if not len(indexes):
+            return self
+        coords = self._coords.copy()
+        for name, idx in indexes.items():
+            coords[name] = Coordinate(name, idx)
+        obj = self._replace(coords=coords)
+
+        # switch from dimension to level names, if necessary
+        dim_names = {}
+        for dim, idx in indexes.items():
+            if not isinstance(idx, pd.MultiIndex) and idx.name != dim:
+                dim_names[dim] = idx.name
+        if dim_names:
+            obj = obj.rename(dim_names)
+        return obj
+
     __this_array = _ThisArray()

     def _to_temp_dataset(self):

Review comment on `if not len(indexes):` — just use `if not indexes`.
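Continuing the earlier sketch, a hedged illustration of the behavior `_replace_indexes` enables as I read the diff; the expected output is an inference from the code, not a quoted result:

```python
# Illustrative only. Selecting a scalar on the first level drops that
# level; the surviving level's index comes back through new_indexes, and
# _replace_indexes then renames the 'x' dimension to the level name.
sub = da.sel(x='a')  # da as constructed above
# expected: sub.dims == ('number',)
# expected: sub.indexes['number'] contains [1, 2]
```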
@@ -599,8 +611,10 @@ def sel(self, method=None, tolerance=None, **indexers):
         Dataset.sel
         DataArray.isel
         """
-        return self.isel(**indexing.remap_label_indexers(
-            self, indexers, method=method, tolerance=tolerance))
+        pos_indexers, new_indexes = indexing.remap_label_indexers(
+            self, indexers, method=method, tolerance=tolerance
+        )
+        return self.isel(**pos_indexers)._replace_indexes(new_indexes)

     def isel_points(self, dim='points', **indexers):
         """Return a new DataArray whose dataset is given by pointwise integer
@@ -419,6 +419,23 @@ def _replace_vars_and_dims(self, variables, coord_names=None,
         obj = self._construct_direct(variables, coord_names, dims, attrs)
         return obj

+    def _replace_indexes(self, indexes):
+        if not len(indexes):
+            return self
+        variables = self._variables.copy()
+        for name, idx in indexes.items():
+            variables[name] = Coordinate(name, idx)
+        obj = self._replace_vars_and_dims(variables)
+
+        # switch from dimension to level names, if necessary
+        dim_names = {}
+        for dim, idx in indexes.items():
+            if not isinstance(idx, pd.MultiIndex) and idx.name != dim:
+                dim_names[dim] = idx.name
+        if dim_names:
+            obj = obj.rename(dim_names)
+        return obj
+
     def copy(self, deep=False):
         """Returns a copy of this dataset.
@@ -954,7 +971,9 @@ def sel(self, method=None, tolerance=None, **indexers):
             Requires pandas>=0.17.
         **indexers : {dim: indexer, ...}
             Keyword arguments with names matching dimensions and values given
-            by scalars, slices or arrays of tick labels.
+            by scalars, slices or arrays of tick labels. For dimensions with
+            multi-index, the indexer may also be a dict-like object with keys
+            matching index level names.

         Returns
         -------
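A hedged usage sketch of the dict-like indexer documented above; the dataset, variable, and level names are invented for illustration:

```python
# Illustrative only -- dict-like selection on a multi-indexed dimension.
import numpy as np
import pandas as pd
import xarray as xr

midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                  names=['letter', 'number'])
ds = xr.Dataset({'var': ('x', np.arange(4.0))}, coords={'x': midx})

# Keys match level names: a full set of scalar labels selects a single
# element; a partial dict selects along the remaining level(s).
ds.sel(x={'letter': 'a', 'number': 2})
ds.sel(x={'letter': 'b'})
```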
@@ -972,8 +991,10 @@ def sel(self, method=None, tolerance=None, **indexers):
         Dataset.isel_points
         DataArray.sel
         """
-        return self.isel(**indexing.remap_label_indexers(
-            self, indexers, method=method, tolerance=tolerance))
+        pos_indexers, new_indexes = indexing.remap_label_indexers(
+            self, indexers, method=method, tolerance=tolerance
+        )
+        return self.isel(**pos_indexers)._replace_indexes(new_indexes)

     def isel_points(self, dim='points', **indexers):
         """Returns a new dataset with each array indexed pointwise along the

Review comment: Does this handle the case where […]
Reply: Nevermind, that can't happen.
@@ -1114,8 +1135,9 @@ def sel_points(self, dim='points', method=None, tolerance=None,
         Dataset.isel_points
         DataArray.sel_points
         """
-        pos_indexers = indexing.remap_label_indexers(
-            self, indexers, method=method, tolerance=tolerance)
+        pos_indexers, _ = indexing.remap_label_indexers(
+            self, indexers, method=method, tolerance=tolerance
+        )
         return self.isel_points(dim=dim, **pos_indexers)

     def reindex_like(self, other, method=None, tolerance=None, copy=True):
@@ -1396,9 +1418,6 @@ def unstack(self, dim):
         obj = self.reindex(copy=False, **{dim: full_idx})

-        new_dim_names = index.names
-        if any(name is None for name in new_dim_names):
-            raise ValueError('cannot unstack dimension with unnamed levels')
-
         new_dim_sizes = [lev.size for lev in index.levels]

         variables = OrderedDict()
@@ -4,7 +4,7 @@

 from . import utils
 from .pycompat import iteritems, range, dask_array_type, suppress
-from .utils import is_full_slice
+from .utils import is_full_slice, is_dict_like


 def expanded_indexer(key, ndim):
@@ -135,11 +135,18 @@ def _asarray_tuplesafe(values):
     return result


+def _is_nested_tuple(possible_tuple):
+    return (isinstance(possible_tuple, tuple)
+            and any(isinstance(value, (tuple, list, slice))
+                    for value in possible_tuple))
+
+
 def convert_label_indexer(index, label, index_name='', method=None,
                           tolerance=None):
     """Given a pandas.Index and labels (e.g., from __getitem__) for one
     dimension, return an indexer suitable for indexing an ndarray along that
-    dimension
+    dimension. If `index` is a pandas.MultiIndex and depending on `label`,
+    return a new pandas.Index or pandas.MultiIndex (otherwise return None).
     """
     # backwards compatibility for pandas<0.16 (method) or pandas<0.17
     # (tolerance)
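A quick hedged illustration of what the new `_is_nested_tuple` helper distinguishes (the inputs are invented; the helper is as defined in the diff above):

```python
# Flat tuples of scalar labels are not "nested"; tuples containing
# slices, lists, or tuples are, and get routed to MultiIndex.get_locs.
_is_nested_tuple(('a', 1))          # False
_is_nested_tuple(('a', [1, 2]))     # True (contains a list)
_is_nested_tuple((slice(None), 1))  # True (contains a slice)
```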
@@ -152,6 +159,8 @@ def convert_label_indexer(index, label, index_name='', method=None,
                 'the tolerance argument requires pandas v0.17 or newer')
         kwargs['tolerance'] = tolerance

+    new_index = None
+
     if isinstance(label, slice):
         if method is not None or tolerance is not None:
             raise NotImplementedError(
@@ -166,29 +175,63 @@ def convert_label_indexer(index, label, index_name='', method=None,
             raise KeyError('cannot represent labeled-based slice indexer for '
                            'dimension %r with a slice over integer positions; '
                            'the index is unsorted or non-unique')

+    elif is_dict_like(label):
+        is_nested_vals = _is_nested_tuple(tuple(label.values()))
+        if not isinstance(index, pd.MultiIndex):
+            raise ValueError('cannot use a dict-like object for selection on a '
+                             'dimension that does not have a MultiIndex')
+        elif len(label) == index.nlevels and not is_nested_vals:
+            indexer = index.get_loc(tuple((label[k] for k in index.names)))
+        else:
+            indexer, new_index = index.get_loc_level(tuple(label.values()),
+                                                     level=tuple(label.keys()))
+
+    elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex):
+        if _is_nested_tuple(label):
+            indexer = index.get_locs(label)
+        elif len(label) == index.nlevels:
+            indexer = index.get_loc(label)
+        else:
+            indexer, new_index = index.get_loc_level(
+                label, level=list(range(len(label)))
+            )
+
Review comment: I think we could reproduce what pandas does in terms of collapsing tuple levels if we call:

    # untested!
    elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex):
        if _is_nested_tuple(label):
            indexer = index.get_locs(label)
        else:
            indexer, new_index = index.get_loc_level(label, level=range(len(label)))

Reply: Yes it works! However, using non-nested tuples here consists of selecting single elements and raises the question of how we handle returned scalar values. In that specific case we should drop the dimension but keep the 0-d (multi-level) coordinate so that […]. More generally, I think we definitely need to carefully address level drop in all cases.

Reply: Doing some tests, it seems like […]
Good: […]
Bad: […]
Good: […]

Reply: So I guess we need to check the length of the tuple (probably also in the […]):

    elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex):
        if _is_nested_tuple(label):
            indexer = index.get_locs(label)
        elif len(label) == index.nlevels:
            indexer = index.get_loc(label)
        else:
            indexer, new_index = index.get_loc_level(label, level=range(len(label)))

Reply (EDIT: forget about this comment, it is complete nonsense :) ):

    def _maybe_drop_levels(index):
        drop_levels = [i for i, lab in enumerate(index.labels)
                       if not np.ptp(lab.values())]
        if len(drop_levels) < len(index.labels):
            return index.droplevel(drop_levels)
        else:
            return index

    def convert_label_indexer(...):
        # ...
        if isinstance(new_index, pd.MultiIndex):
            new_index = _maybe_drop_levels(new_index)
        return indexer, new_index

Reply: The advantage of doing something like my proposed logic (which I think is similar to what pandas does) is that whether a level is dropped depends only on the indexer type and the number of multi-index levels, as opposed to dropping levels in a way that also depends on the particular values in the indexer and index. Code that depends only on type information rather than values is generally easier to understand and less error-prone.

Reply: also, […]

Reply: Yep, I used […]. Anyway, I get your logic. It is also much more efficient!
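To ground the thread above, a hedged standalone sketch (not from the PR) of the pandas calls being compared; the index is invented, and the commented outputs reflect typical pandas behavior:

```python
import pandas as pd

idx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                 names=['one', 'two'])

# Full-length tuple: get_loc returns a single position.
idx.get_loc(('a', 1))  # -> 0

# Partial label: get_loc_level also returns a new index with the selected
# level dropped -- the "collapsing" discussed above.
indexer, new_index = idx.get_loc_level('a', level=0)
# indexer covers the 'a' block; new_index is [1, 2] with name 'two'

# Nested indexers: get_locs takes per-level label lists and keeps levels.
idx.get_locs((['a', 'b'], [1]))  # positions of ('a', 1) and ('b', 1)
```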
Review comment on the scalar branch below (`if label.ndim == 0:`): this is where scalars end up -- probably need to add a clause here to handle MultiIndex.

     else:
         label = _asarray_tuplesafe(label)
         if label.ndim == 0:
-            indexer = index.get_loc(label.item(), **kwargs)
+            if isinstance(index, pd.MultiIndex):
+                indexer, new_index = index.get_loc_level(label.item(), level=0)
+            else:
+                indexer = index.get_loc(label.item(), **kwargs)
         elif label.dtype.kind == 'b':
             indexer, = np.nonzero(label)
         else:
             indexer = index.get_indexer(label, **kwargs)
             if np.any(indexer < 0):
                 raise KeyError('not all values found in index %r'
                                % index_name)
-    return indexer
+    return indexer, new_index


 def remap_label_indexers(data_obj, indexers, method=None, tolerance=None):
     """Given an xarray data object and label based indexers, return a mapping
-    of equivalent location based indexers.
+    of equivalent location based indexers. Also return a mapping of updated
+    pandas index objects (in case of multi-index level drop).
     """
     if method is not None and not isinstance(method, str):
         raise TypeError('``method`` must be a string')
-    return dict((dim, convert_label_indexer(data_obj[dim].to_index(), label,
-                                            dim, method, tolerance))
-                for dim, label in iteritems(indexers))
+    pos_indexers, new_indexes = {}, {}
+    for dim, label in iteritems(indexers):
+        index = data_obj[dim].to_index()
+        idxr, new_idx = convert_label_indexer(index, label,
+                                              dim, method, tolerance)
+        pos_indexers[dim] = idxr
+        if new_idx is not None:
+            new_indexes[dim] = new_idx
+
+    return pos_indexers, new_indexes


 def slice_slice(old_slice, applied_slice, size):
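A hedged sketch of the new return contract of `remap_label_indexers`, reusing the `da` object from the earlier example; the exact indexer types are an inference from the diff:

```python
# Illustrative only -- the function now returns two mappings.
from xarray.core import indexing

pos_indexers, new_indexes = indexing.remap_label_indexers(da, {'x': 'a'})
# pos_indexers: {'x': <positional indexer covering the 'a' block>}
# new_indexes:  {'x': index [1, 2] named 'number'} -- an entry appears
#               only when a multi-index level was dropped
```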
@@ -1096,7 +1096,13 @@ def to_index(self):
         # basically free as pandas.Index objects are immutable
         assert self.ndim == 1
         index = self._data_cached().array
-        if not isinstance(index, pd.MultiIndex):
+        if isinstance(index, pd.MultiIndex):
+            # set default names for multi-index unnamed levels so that
+            # we can safely rename dimension / coordinate later
+            valid_level_names = [name or '{}_level_{}'.format(self.name, i)
+                                 for i, name in enumerate(index.names)]
+            index = index.set_names(valid_level_names)
+        else:
             index = index.set_names(self.name)
         return index

Review comment on the default level names: I would lean towards defaulting to simply "level_0", etc., but I don't feel too strongly here.
Reply: Yes it makes sense and it is more compact. However, how do we handle cases where two or more multi-index dimensions both collapse to […]? I let you decide.
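A hedged illustration of the renaming `to_index` performs for unnamed levels; the `'{}_level_{}'` pattern comes from the diff, while the index and the `'x'` dimension name are invented:

```python
import pandas as pd

# An index with one named and one unnamed level, as might back an 'x' dim.
midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                  names=['letter', None])

# to_index would fill the missing name following the diff's pattern:
names = [name or '{}_level_{}'.format('x', i)
         for i, name in enumerate(midx.names)]
midx = midx.set_names(names)
print(midx.names)  # ['letter', 'x_level_1']
```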
Review comment (on a documentation paragraph not shown above): It might make sense simply to drop this paragraph instead -- do we really need to explicitly call out MultiIndex if it's supported?
Reply: No, I don't think we need it. However, it might be good to put a sentence somewhere in the docs recommending that users set names for multi-index levels before creating data arrays or datasets. What do you think?