From eb705062b1cd297744db917e5fb39436814cc789 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 4 Nov 2016 20:40:53 -0700 Subject: [PATCH 1/6] Indexes are now optional --- doc/api.rst | 2 + doc/combining.rst | 1 + doc/computation.rst | 17 +- doc/data-structures.rst | 56 +++--- doc/examples/quick-overview.rst | 50 ++++-- doc/indexing.rst | 34 +++- doc/whats-new.rst | 32 +++- xarray/backends/common.py | 25 --- xarray/conventions.py | 4 +- xarray/core/alignment.py | 172 +++++++++++++----- xarray/core/common.py | 18 +- xarray/core/coordinates.py | 84 ++++----- xarray/core/dataarray.py | 66 +++---- xarray/core/dataset.py | 96 +++++----- xarray/core/groupby.py | 266 +++++++++++++++------------ xarray/core/indexing.py | 103 +++++------ xarray/core/merge.py | 11 +- xarray/core/rolling.py | 6 +- xarray/core/variable.py | 11 +- xarray/plot/plot.py | 5 +- xarray/plot/utils.py | 2 +- xarray/test/test_backends.py | 3 +- xarray/test/test_combine.py | 61 ++++--- xarray/test/test_conventions.py | 5 +- xarray/test/test_dask.py | 16 +- xarray/test/test_dataarray.py | 309 +++++++++++++++++++++----------- xarray/test/test_dataset.py | 279 ++++++++++++++++------------ xarray/test/test_groupby.py | 7 + xarray/test/test_merge.py | 13 +- xarray/test/test_plot.py | 6 +- 30 files changed, 1048 insertions(+), 712 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 2602d9f2e29..cac4d9d4496 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -44,6 +44,7 @@ Attributes Dataset.coords Dataset.attrs Dataset.indexes + Dataset.get_index Dictionary interface -------------------- @@ -193,6 +194,7 @@ Attributes DataArray.attrs DataArray.encoding DataArray.indexes + DataArray.get_index **ndarray attributes**: :py:attr:`~DataArray.ndim` diff --git a/doc/combining.rst b/doc/combining.rst index b0fbeff0d4b..e16ff045b4c 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -205,6 +205,7 @@ have ``NaN`` values. This can be used to combine data with overlapping coordinates as long as any non-missing values agree or are disjoint: .. ipython:: python + ds1 = xr.Dataset({'a': ('x', [10, 20, 30, np.nan])}, {'x': [1, 2, 3, 4]}) ds2 = xr.Dataset({'a': ('x', [np.nan, 30, 40, 50])}, {'x': [2, 3, 4, 5]}) xr.merge([ds1, ds2], compat='no_conflicts') diff --git a/doc/computation.rst b/doc/computation.rst index 9cba5db2061..229a524ed4e 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -196,7 +196,9 @@ This means, for example, that you always subtract an array from its transpose: You can explicitly broadcast xaray data structures by using the :py:func:`~xarray.broadcast` function: - a2, b2 = xr.broadcast(a, b2) +.. ipython:: python + + a2, b2 = xr.broadcast(a, b) a2 b2 @@ -215,15 +217,18 @@ operations. The default result of a binary operation is by the *intersection* .. ipython:: python - arr + arr[:1] + arr = xr.DataArray(np.arange(3), [('x', range(3))]) + arr + arr[:-1] -If the result would be empty, an error is raised instead: +If coordinate values for a dimension are missing on either argument, all +matching dimensions must have the same size: -.. ipython:: +.. 
ipython:: python
+.. ipython::
+    @verbatim
+    In [1]: arr + xr.DataArray([1, 2], dims='x')
+    ValueError: arguments without labels along dimension 'x' cannot be aligned because they have different dimension size(s) {2} than the size of the aligned dimension labels: 3
+
+However, one can explicitly change this default automatic alignment type
+("inner") within a context manager via :py:func:`~xarray.set_options()`:
diff --git a/doc/data-structures.rst b/doc/data-structures.rst
index 947c09db7d6..7345675c2b3 100644
--- a/doc/data-structures.rst
+++ b/doc/data-structures.rst
@@ -67,18 +67,33 @@ in with default values:
 
     xr.DataArray(data)
 
-As you can see, dimensions and coordinate arrays corresponding to each
-dimension are always present. This behavior is similar to pandas, which fills
-in index values in the same way.
+As you can see, dimension names are always present in the xarray data model: if
+you do not provide them, defaults of the form ``dim_N`` will be created.
+
+.. note::
+
+    Prior to xarray v0.9, coordinates corresponding to dimensions were *also*
+    always present in xarray: xarray would create default coordinates of the
+    form ``range(dim_size)`` if coordinates were not supplied explicitly. This
+    is no longer the case.
 
 Coordinates can take the following forms:
 
-- A list of ``(dim, ticks[, attrs])`` pairs with length equal to the number of dimensions
-- A dictionary of ``{coord_name: coord}`` where the values are each a scalar value,
-  a 1D array or a tuple. Tuples are be in the same form as the above, and
-  multiple dimensions can be supplied with the form ``(dims, data[, attrs])``.
-  Supplying as a tuple allows other coordinates than those corresponding to
-  dimensions (more on these later).
+- A list of values with length equal to the number of dimensions, providing
+  coordinate labels for each dimension. Each value must be of one of the
+  following forms:
+
+  * A :py:class:`~xarray.DataArray` or :py:class:`~xarray.Variable`
+  * A tuple of the form ``(dims, data[, attrs])``, which is converted into
+    arguments for :py:class:`~xarray.Variable`
+  * A pandas object or scalar value, which is converted into a ``DataArray``
+  * A 1D array or list, which is interpreted as values for a one-dimensional
+    coordinate variable along the same dimension as its name
+
+- A dictionary of ``{coord_name: coord}`` where values are of the same form
+  as the list. Supplying coordinates as a dictionary allows other coordinates
+  than those corresponding to dimensions (more on these later). If you supply
+  ``coords`` as a dictionary, you must explicitly provide ``dims``.
 
 As a list of tuples:
 
@@ -128,7 +143,7 @@ Let's take a look at the important properties on our array:
 
     foo.attrs
     print(foo.name)
 
-You can even modify ``values`` inplace:
+You can modify ``values`` inplace:
 
 .. ipython:: python
 
@@ -228,14 +243,19 @@ Creating a Dataset
 
 To make an :py:class:`~xarray.Dataset` from scratch, supply dictionaries for any
 variables (``data_vars``), coordinates (``coords``) and attributes (``attrs``).
 
-``data_vars`` are supplied as a dictionary with each key as the name of the variable and each
+- ``data_vars`` should be a dictionary with each key as the name of the variable and each
 value as one of:
-- A :py:class:`~xarray.DataArray`
-- A tuple of the form ``(dims, data[, attrs])``
-- A pandas object
-``coords`` are supplied as dictionary of ``{coord_name: coord}`` where the values are scalar values,
-arrays or tuples in the form of ``(dims, data[, attrs])``.
+
+  * A :py:class:`~xarray.DataArray` or :py:class:`~xarray.Variable`
+  * A tuple of the form ``(dims, data[, attrs])``, which is converted into
+    arguments for :py:class:`~xarray.Variable`
+  * A pandas object, which is converted into a ``DataArray``
+  * A 1D array or list, which is interpreted as values for a one-dimensional
+    coordinate variable along the same dimension as its name
+
+- ``coords`` should be a dictionary of the same form as ``data_vars``.
+
+- ``attrs`` should be a dictionary.
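+
+For example, here is a minimal ``Dataset`` built from each of these styles
+(an illustrative sketch; the names ``'a'``, ``'x'`` and ``'title'`` are
+arbitrary):
+
+.. ipython:: python
+
+    # one tuple-form data variable, one dimension coordinate, one attribute
+    xr.Dataset(data_vars={'a': ('x', [1, 2, 3])},
+               coords={'x': [10, 20, 30]},
+               attrs={'title': 'example'})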
 
 Let's create some fake data for the example we show above:
 
@@ -256,10 +276,6 @@ Let's create some fake data for the example we show above:
         'reference_time': pd.Timestamp('2014-09-05')})
     ds
 
-Notice that we did not explicitly include coordinates for the "x" or "y"
-dimensions, so they were filled in array of ascending integers of the proper
-length.
-
 Here we pass :py:class:`xarray.DataArray` objects or a pandas object as values
 in the dictionary:
diff --git a/doc/examples/quick-overview.rst b/doc/examples/quick-overview.rst
index 3b476cdf1c2..aa0381444e1 100644
--- a/doc/examples/quick-overview.rst
+++ b/doc/examples/quick-overview.rst
@@ -23,7 +23,7 @@ array or list, with optional *dimensions* and *coordinates*:
 
 .. ipython:: python
 
     xr.DataArray(np.random.randn(2, 3))
-    data = xr.DataArray(np.random.randn(2, 3), [('x', ['a', 'b']), ('y', [-2, 0, 2])])
+    data = xr.DataArray(np.random.randn(2, 3), coords={'x': ['a', 'b']}, dims=('x', 'y'))
     data
 
 If you supply a pandas :py:class:`~pandas.Series` or
@@ -121,31 +121,55 @@ xarray supports grouped operations using a very similar API to pandas:
 
     data.groupby(labels).mean('y')
     data.groupby(labels).apply(lambda x: x - x.min())
 
-Convert to pandas
------------------
+pandas
+------
 
-A key feature of xarray is robust conversion to and from pandas objects:
+Xarray objects can be easily converted to and from pandas objects:
 
 .. ipython:: python
 
-    data.to_series()
-    data.to_pandas()
+    series = data.to_series()
+    series
 
-Datasets and NetCDF
--------------------
+    # convert back
+    series.to_xarray()
 
-:py:class:`xarray.Dataset` is a dict-like container of ``DataArray`` objects that share
-index labels and dimensions. It looks a lot like a netCDF file:
+Datasets
+--------
+
+:py:class:`xarray.Dataset` is a dict-like container of aligned ``DataArray``
+objects. You can think of it as a multi-dimensional generalization of the
+:py:class:`pandas.DataFrame`:
 
 .. ipython:: python
 
-    ds = data.to_dataset(name='foo')
+    ds = xr.Dataset({'foo': data, 'bar': ('x', [1, 2]), 'baz': np.pi})
     ds
 
+Use dictionary indexing to pull out ``Dataset`` variables as ``DataArray``
+objects:
+
+.. ipython:: python
+
+    ds['foo']
+
+Variables in datasets can have different ``dtype`` and even different
+dimensions, but all dimensions are assumed to refer to points in the same shared
+coordinate system.
+
 You can do almost everything you can do with ``DataArray`` objects with
-``Dataset`` objects if you prefer to work with multiple variables at once.
+``Dataset`` objects (including indexing and arithmetic) if you prefer to work
+with multiple variables at once.
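+
+For example, reductions apply to every (numeric) data variable at once. A
+quick sketch, using the ``ds`` object created above:
+
+.. ipython:: python
+
+    # reduces 'foo', 'bar' and 'baz' over all of their dimensions
+    ds.mean()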
+
+NetCDF
+------
+
+NetCDF is the recommended binary serialization format for xarray objects. Users
+from the geosciences will recognize that the :py:class:`~xarray.Dataset` data
+model looks very similar to a netCDF file (which, in fact, inspired it).
 
-Datasets also let you easily read and write netCDF files:
+You can directly read and write xarray objects to disk using
+:py:meth:`~xarray.Dataset.to_netcdf`, :py:func:`~xarray.open_dataset` and
+:py:func:`~xarray.open_dataarray`:
 
 .. ipython:: python
 
diff --git a/doc/indexing.rst b/doc/indexing.rst
index acf6920c4f6..378a04b3942 100644
--- a/doc/indexing.rst
+++ b/doc/indexing.rst
@@ -221,7 +221,7 @@ enabling nearest neighbor (inexact) lookups by use of the methods ``'pad'``,
 
 .. ipython:: python
 
-    data = xr.DataArray([1, 2, 3], dims='x')
+    data = xr.DataArray([1, 2, 3], [('x', [0, 1, 2])])
     data.sel(x=[1.1, 1.9], method='nearest')
     data.sel(x=0.1, method='backfill')
     data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad')
@@ -478,6 +478,30 @@ Both ``reindex_like`` and ``align`` work interchangeably between
 
     # this is a no-op, because there are no shared dimension names
     ds.reindex_like(other)
 
+.. _indexing.missing_coordinates:
+
+Missing coordinate labels
+-------------------------
+
+Coordinate labels for each dimension are optional (as of xarray v0.9).
+Label-based indexing with ``.sel`` and ``.loc`` uses standard positional,
+integer-based indexing as a fallback for dimensions without a coordinate label:
+
+.. ipython:: python
+
+    array = xr.DataArray([1, 2, 3], dims='x')
+    array.sel(x=[0, -1])
+
+Alignment between xarray objects where one or both do not have coordinate labels
+succeeds only if all dimensions of the same name have the same length.
+Otherwise, it raises an informative error:
+
+.. ipython::
+    :verbatim:
+
+    In [62]: xr.align(array, array[:2])
+    ValueError: arguments without labels along dimension 'x' cannot be aligned because they have different dimension sizes: {2, 3}
+
 Underlying Indexes
 ------------------
 
@@ -491,3 +515,11 @@ through the :py:attr:`~xarray.DataArray.indexes` attribute.
 
     arr.indexes
     arr.indexes['time']
+
+Use :py:meth:`~xarray.DataArray.get_index` to get an index for a dimension,
+falling back to a default :py:class:`pandas.RangeIndex` if it has no coordinate
+labels:
+
+.. ipython:: python
+
+    array
+    array.get_index('x')
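+
+For a dimension that does have coordinate labels, ``get_index`` returns the
+same index exposed by the ``indexes`` attribute (a short illustrative sketch,
+reusing the ``arr`` object from above):
+
+.. ipython:: python
+
+    # for a labeled dimension, get_index is equivalent to indexes[dim]
+    arr.get_index('time').equals(arr.indexes['time'])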
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ce305a6aa24..3d79f856905 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -21,10 +21,31 @@ v0.9.0 (unreleased)
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Index coordinates for each dimension are now optional, and no longer created
+  by default. This has a number of implications:
+
+  - :py:func:`~align` and :py:meth:`~Dataset.reindex` can now error if
+    dimension labels are missing and dimensions have different sizes.
+  - Because pandas does not support missing indexes, methods such as
+    ``to_dataframe``/``from_dataframe`` and ``stack``/``unstack`` no longer
+    roundtrip faithfully on all inputs. Use :py:meth:`~Dataset.reset_index` to
+    remove undesired indexes.
+  - ``Dataset.__delitem__`` and :py:meth:`~Dataset.drop` no longer delete/drop
+    variables that have dimensions matching a deleted/dropped variable.
+  - ``DataArray.coords.__delitem__`` is now allowed on variables matching
+    dimension names.
+  - ``.sel`` and ``.loc`` now handle indexing along a dimension without
+    coordinate labels by doing integer-based indexing. See
+    :ref:`indexing.missing_coordinates` for an example.
+  - :py:attr:`~Dataset.indexes` is no longer guaranteed to include all
+    dimension names as keys. The new method :py:meth:`~Dataset.get_index` has
+    been added; it is guaranteed to return an index for a dimension, falling
+    back to a default ``RangeIndex`` if necessary.
+
 - The default behavior of ``merge`` is now ``compat='no_conflicts'``, so some
   merges will now succeed in cases that previously raised
   ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the
-  previous default.
+  previous default. See :ref:`combining.no_conflicts` for more details.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -123,6 +144,13 @@ Bug fixes
   should be computed or not.
   By `Fabien Maussion `_.
 
+- Grouping over a dimension with non-unique values with ``groupby`` gives
+  correct groups.
+  By `Stephan Hoyer `_.
+
+- Fixed accessing coordinate variables with non-string names from ``.coords``.
+  By `Stephan Hoyer `_.
+
 .. _whats-new.0.8.2:
 
 v0.8.2 (18 August 2016)
@@ -1242,7 +1270,7 @@ Enhancements
 
 .. ipython:: python
 
-    data = xray.DataArray([1, 2, 3], dims='x')
+    data = xray.DataArray([1, 2, 3], [('x', range(3))])
     data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad')
 
 This will be especially useful once pandas 0.16 is released, at which point
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index bf85930c8df..9e9648db730 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -33,25 +33,6 @@ def _decode_variable_name(name):
     return name
 
 
-def is_trivial_index(var):
-    """
-    Determines if in index is 'trivial' meaning that it is
-    equivalent to np.arange(). This is determined by
-    checking if there are any attributes or encodings,
-    if ndims is one, dtype is int and finally by comparing
-    the actual values to np.arange()
-    """
-    # if either attributes or encodings are defined
-    # the index is not trivial.
-    if len(var.attrs) or len(var.encoding):
-        return False
-    # if the index is not a 1d integer array
-    if var.ndim > 1 or not var.dtype.kind == 'i':
-        return False
-    arange = np.arange(var.size, dtype=var.dtype)
-    return np.all(var.values == arange)
-
-
 def robust_getitem(array, key, catch=Exception, max_retries=6,
                    initial_delay=500):
     """
@@ -203,12 +184,6 @@ def store_dataset(self, dataset):
 
     def store(self, variables, attributes, check_encoding_set=frozenset()):
         self.set_attributes(attributes)
-        neccesary_dims = [v.dims for v in variables.values()]
-        neccesary_dims = set(itertools.chain(*neccesary_dims))
-        # set all non-indexes and any index which is not trivial.
-        variables = OrderedDict((k, v) for k, v in iteritems(variables)
-                                if not (k in neccesary_dims and
-                                        is_trivial_index(v)))
         self.set_variables(variables, check_encoding_set)
 
     def set_attributes(self, attributes):
diff --git a/xarray/conventions.py b/xarray/conventions.py
index ef7705eb6d7..9e332039bbc 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -913,7 +913,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
         identify coordinates.
     drop_variables: string or iterable, optional
         A variable or list of variables to exclude from being parsed from the
-        dataset.This may be useful to drop variables with problems or
+        dataset. This may be useful to drop variables with problems or
         inconsistent values.
 
Returns @@ -939,7 +939,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, vars, attrs, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables=drop_variables) ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.union(extra_coords)) + ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj return ds diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 53c47b35c10..912858e3c49 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -10,8 +10,9 @@ from . import ops, utils from .common import _maybe_promote -from .pycompat import iteritems, OrderedDict -from .utils import is_full_slice, is_dict_like +from .indexing import get_indexer +from .pycompat import iteritems, OrderedDict, suppress +from .utils import is_full_slice from .variable import Variable, IndexVariable @@ -32,10 +33,10 @@ def align(*objects, **kwargs): """align(*objects, join='inner', copy=True) Given any number of Dataset and/or DataArray objects, returns new - objects with aligned indexes. + objects with aligned indexes and dimension sizes. Array from the aligned objects are suitable as input to mathematical - operators, because along each dimension they have the same indexes. + operators, because along each dimension they have the same index and size. Missing values (if ``join != 'inner'``) are filled with NaN. @@ -65,6 +66,12 @@ def align(*objects, **kwargs): ------- aligned : same as *objects Tuple of objects with aligned coordinates. + + Raises + ------ + ValueError + If any dimensions without labels on the arguments have different sizes, + or a different size than the size of the aligned dimension labels. """ join = kwargs.pop('join', 'inner') copy = kwargs.pop('copy', True) @@ -79,48 +86,114 @@ def align(*objects, **kwargs): % list(kwargs)) all_indexes = defaultdict(list) + unlabeled_dim_sizes = defaultdict(set) for obj in objects: - for dim, index in iteritems(obj.indexes): + for dim in obj.dims: if dim not in exclude: - all_indexes[dim].append(index) - - # We don't join over dimensions with all equal indexes for two reasons: + try: + index = obj.indexes[dim] + except KeyError: + unlabeled_dim_sizes[dim].add(obj.sizes[dim]) + else: + all_indexes[dim].append(index) + + # We don't reindex over dimensions with all equal indexes for two reasons: # - It's faster for the usual case (already aligned objects). # - It ensures it's possible to do operations that don't require alignment # on indexes with duplicate values (which cannot be reindexed with # pandas). This is useful, e.g., for overwriting such duplicate indexes. 
joiner = _get_joiner(join) joined_indexes = {} - for dim, dim_indexes in iteritems(all_indexes): + for dim, matching_indexes in iteritems(all_indexes): if dim in indexes: index = utils.safe_cast_to_index(indexes[dim]) - if any(not index.equals(other) for other in dim_indexes): + if (any(not index.equals(other) for other in matching_indexes) or + dim in unlabeled_dim_sizes): joined_indexes[dim] = index else: - if any(not dim_indexes[0].equals(other) - for other in dim_indexes[1:]): - joined_indexes[dim] = joiner(dim_indexes) + if (any(not matching_indexes[0].equals(other) + for other in matching_indexes[1:]) or + dim in unlabeled_dim_sizes): + index = joiner(matching_indexes) + joined_indexes[dim] = index + else: + index = matching_indexes[0] + + if dim in unlabeled_dim_sizes: + unlabeled_sizes = unlabeled_dim_sizes[dim] + labeled_size = index.size + if len(unlabeled_sizes | {labeled_size}) > 1: + raise ValueError( + 'arguments without labels along dimension %r cannot be ' + 'aligned because they have different dimension size(s) %r ' + 'than the size of the aligned dimension labels: %r' + % (dim, unlabeled_sizes, labeled_size)) + + for dim in unlabeled_dim_sizes: + if dim not in all_indexes: + sizes = unlabeled_dim_sizes[dim] + if len(sizes) > 1: + raise ValueError( + 'arguments without labels along dimension %r cannot be ' + 'aligned because they have different dimension sizes: %r' + % (dim, sizes)) result = [] for obj in objects: - valid_indexers = dict((k, v) for k, v in joined_indexes.items() - if k in obj.dims) + valid_indexers = {k: v for k, v in joined_indexes.items() + if k in obj.dims} result.append(obj.reindex(copy=copy, **valid_indexers)) return tuple(result) -def reindex_variables(variables, indexes, indexers, method=None, +def reindex_like_indexers(target, other): + """Extract indexers to align target with other. + + Not public API. + + Parameters + ---------- + target : Dataset or DataArray + Object to be aligned. + other : Dataset or DataArray + Object to be aligned with. + + Returns + ------- + Dict[Any, pandas.Index] providing indexes for reindex keyword arguments. + + Raises + ------ + ValueError + If any dimensions without labels have different sizes. + """ + indexers = {k: v for k, v in other.indexes.items() if k in target.dims} + + for dim in other.dims: + if dim not in indexers and dim in target.dims: + other_size = other.sizes[dim] + target_size = target.sizes[dim] + if other_size != target_size: + raise ValueError('different size for unlabeled ' + 'dimension on argument %r: %r vs %r' + % (dim, other_size, target_size)) + return indexers + + +def reindex_variables(variables, sizes, indexes, indexers, method=None, tolerance=None, copy=True): """Conform a dictionary of aligned variables onto a new set of variables, filling in missing values with NaN. - WARNING: This method is not public API. Don't use it directly. + Not public API. Parameters ---------- variables : dict-like Dictionary of xarray.Variable objects. + sizes : dict-like + Dictionary from dimension names to integer sizes. indexes : dict-like Dictionary of xarray.IndexVariable objects associated with variables. indexers : dict @@ -150,31 +223,22 @@ def reindex_variables(variables, indexes, indexers, method=None, reindexed : OrderedDict Another dict, with the items in variables but replaced indexes. 
""" - # build up indexers for assignment along each index + # build up indexers for assignment along each dimension to_indexers = {} - to_shape = {} from_indexers = {} - - # for compat with older versions of pandas that don't support tolerance - get_indexer_kwargs = {} - if tolerance is not None: - if pd.__version__ < '0.17': - raise NotImplementedError( - 'the tolerance argument requires pandas v0.17 or newer') - get_indexer_kwargs['tolerance'] = tolerance + # size of reindexed dimensions + new_sizes = {} for name, index in iteritems(indexes): - to_shape[name] = index.size if name in indexers: target = utils.safe_cast_to_index(indexers[name]) if not index.is_unique: raise ValueError( 'cannot reindex or align along dimension %r because the ' 'index has duplicate values' % name) - indexer = index.get_indexer(target, method=method, - **get_indexer_kwargs) + indexer = get_indexer(index, target, method, tolerance) - to_shape[name] = len(target) + new_sizes[name] = len(target) # Note pandas uses negative values from get_indexer to signify # values that are missing in the index # The non-negative values thus indicate the non-missing values @@ -192,6 +256,16 @@ def reindex_variables(variables, indexes, indexers, method=None, # unnecessary copies from_indexers[name] = slice(None) + for dim in sizes: + if dim not in indexes and dim in indexers: + existing_size = sizes[dim] + new_size = utils.safe_cast_to_index(indexers[dim]).size + if existing_size != new_size: + raise ValueError( + 'cannot reindex or align along dimension %r without an ' + 'index because its size %r is different from the size of ' + 'the new index %r' % (dim, existing_size, new_size)) + def any_not_full_slices(indexers): return any(not is_full_slice(idx) for idx in indexers) @@ -200,12 +274,17 @@ def var_indexers(var, indexers): # create variables for the new dataset reindexed = OrderedDict() - for name, var in iteritems(variables): - if name in indexers: - # no need to copy, because index data is immutable - new_var = IndexVariable(var.dims, indexers[name], var.attrs, - var.encoding) + + for dim, indexer in indexers.items(): + if dim in variables: + var = variables[dim] + args = (var.attrs, var.encoding) else: + args = () + reindexed[dim] = IndexVariable((dim,), indexers[dim], *args) + + for name, var in iteritems(variables): + if name not in indexers: assign_to = var_indexers(var, to_indexers) assign_from = var_indexers(var, from_indexers) @@ -215,7 +294,8 @@ def var_indexers(var, indexers): dtype, fill_value = _maybe_promote(var.dtype) if isinstance(data, np.ndarray): - shape = tuple(to_shape[dim] for dim in var.dims) + shape = tuple(new_sizes.get(dim, size) + for dim, size in zip(var.dims, var.shape)) new_data = np.empty(shape, dtype=dtype) new_data[...] = fill_value # create a new Variable so we can use orthogonal indexing @@ -245,7 +325,7 @@ def var_indexers(var, indexers): # we neither created a new ndarray nor used fancy indexing new_var = var.copy(deep=copy) - reindexed[name] = new_var + reindexed[name] = new_var return reindexed @@ -272,11 +352,6 @@ def broadcast(*args, **kwargs): The same data as the input arrays, but with additional dimensions inserted so that all data arrays have the same dimensions and shape. - Raises - ------ - ValueError - If indexes on the different objects are not aligned. 
- Examples -------- @@ -343,18 +418,17 @@ def broadcast(*args, **kwargs): for arg in args: for dim in arg.dims: if dim not in common_coords and dim not in exclude: - common_coords[dim] = arg.coords[dim].variable - dims_map[dim] = common_coords[dim].size + dims_map[dim] = arg.sizes[dim] + if dim in arg.coords: + common_coords[dim] = arg.coords[dim].variable def _expand_dims(var): # Add excluded dims to a copy of dims_map var_dims_map = dims_map.copy() for dim in exclude: - try: + with suppress(ValueError): + # ignore dim not in var.dims var_dims_map[dim] = var.shape[var.dims.index(dim)] - except ValueError: - # dim not in var.dims - pass return var.expand_dims(var_dims_map) diff --git a/xarray/core/common.py b/xarray/core/common.py index 5ac9994ee8c..647707a3545 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -286,6 +286,17 @@ def squeeze(self, dim=None): class BaseDataObject(SharedMethodsMixin, AttrAccessMixin): """Shared base class for Dataset and DataArray.""" + def get_index(self, key): + """Get an index for a dimension, with fall-back to a default RangeIndex + """ + if key not in self.dims: + raise KeyError(key) + + try: + return self.indexes[key] + except KeyError: + return pd.Index(range(self.sizes[key]), name=key) + def _calc_assign_results(self, kwargs): results = SortedKeysDict() for k, v in kwargs.items(): @@ -408,8 +419,6 @@ def groupby(self, group, squeeze=True): A `GroupBy` object patterned after `pandas.GroupBy` that can be iterated over in the form of `(unique_value, grouped_array)` pairs. """ - if isinstance(group, basestring): - group = self[group] return self.groupby_cls(self, group, squeeze=squeeze) def groupby_bins(self, group, bins, right=True, labels=None, precision=3, @@ -459,8 +468,6 @@ def groupby_bins(self, group, bins, right=True, labels=None, precision=3, ---------- .. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html """ - if isinstance(group, basestring): - group = self[group] return self.groupby_cls(self, group, squeeze=squeeze, bins=bins, cut_kwargs={'right': right, 'labels': labels, 'precision': precision, @@ -639,7 +646,8 @@ def where(self, cond, other=None, drop=False): clip = dict(zip(clipcond.dims, [np.unique(adim) for adim in np.nonzero(clipcond.values)])) outcond = cond.isel(**clip) - outobj = self.sel(**outcond.indexes) + indexers = {dim: outcond.get_index(dim) for dim in outcond.dims} + outobj = self.sel(**indexers) else: outobj = self outcond = cond diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 4aac9fdd67c..41027676aa4 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -8,19 +8,12 @@ from . 
import formatting from .utils import Frozen from .merge import merge_coords, merge_coords_without_align -from .pycompat import iteritems, basestring, OrderedDict -from .variable import default_index_coordinate +from .pycompat import OrderedDict class AbstractCoordinates(Mapping, formatting.ReprMixin): def __getitem__(self, key): - if (key in self._names or - (isinstance(key, basestring) and - key.split('.')[0] in self._names)): - # allow indexing current coordinates or components - return self._data[key] - else: - raise KeyError(key) + raise NotImplementedError def __setitem__(self, key, value): self.update({key: value}) @@ -60,7 +53,7 @@ def to_index(self, ordered_dims=None): """ if ordered_dims is None: ordered_dims = self.dims - indexes = [self.variables[k].to_index() for k in ordered_dims] + indexes = [self._data.get_index(k) for k in ordered_dims] return pd.MultiIndex.from_product(indexes, names=list(ordered_dims)) def update(self, other): @@ -146,6 +139,11 @@ def variables(self): for k, v in self._data.variables.items() if k in self._names)) + def __getitem__(self, key): + if key in self._data.data_vars: + raise KeyError(key) + return self._data[key] + def to_dataset(self): """Convert these coordinates into a new Dataset """ @@ -159,14 +157,13 @@ def _update_coords(self, coords): # check for inconsistent state *before* modifying anything in-place dims = calculate_dimensions(variables) + new_coord_names = set(coords) for dim, size in dims.items(): - if dim not in variables: - variables[dim] = default_index_coordinate(dim, size) - - updated_coord_names = set(coords) | set(dims) + if dim in variables: + new_coord_names.add(dim) self._data._variables = variables - self._data._coord_names.update(updated_coord_names) + self._data._coord_names.update(new_coord_names) self._data._dims = dict(dims) def __delitem__(self, key): @@ -189,11 +186,14 @@ def __init__(self, dataarray): def _names(self): return set(self._data._coords) + def __getitem__(self, key): + return self._data._getitem_coord(key) + def _update_coords(self, coords): from .dataset import calculate_dimensions dims = calculate_dimensions(coords) - if set(dims) != set(self.dims): + if not set(dims) <= set(self.dims): raise ValueError('cannot add coordinates with new dimensions to ' 'a DataArray') self._data._coords = coords @@ -212,64 +212,58 @@ def to_dataset(self): return self._to_dataset() def __delitem__(self, key): - if key in self.dims: - raise ValueError('cannot delete a coordinate corresponding to a ' - 'DataArray dimension') del self._data._coords[key] -class LevelCoordinates(AbstractCoordinates): - """Dictionary like container for MultiIndex level coordinates. +class LevelCoordinatesSource(object): + """Iterator for MultiIndex level coordinates. - Used for attribute style lookup. Not returned directly by any - public methods. + Used for attribute style lookup with AttrAccessMixin. Not returned directly + by any public methods. """ - def __init__(self, dataarray): - self._data = dataarray + def __init__(self, data_object): + self._data = data_object - @property - def _names(self): - return set(self._data._level_coords) + def __getitem__(self, key): + # not necessary -- everything here can already be found in coords. 
+ raise KeyError - @property - def variables(self): - level_coords = OrderedDict( - (k, self._data[v].variable.get_level_variable(k)) - for k, v in self._data._level_coords.items()) - return Frozen(level_coords) + def __iter__(self): + return iter(self._data._level_coords) class Indexes(Mapping, formatting.ReprMixin): """Ordered Mapping[str, pandas.Index] for xarray objects. """ - def __init__(self, variables, dims): + def __init__(self, variables, sizes): """Not for public consumption. Arguments --------- - variables : OrderedDict + variables : OrderedDict[Any, Variable] Reference to OrderedDict holding variable objects. Should be the same dictionary used by the source object. - dims : sequence or mapping - Should be the same dimensions used by the source object. + sizes : OrderedDict[Any, int] + Map from dimension names to sizes. """ self._variables = variables - self._dims = dims + self._sizes = sizes def __iter__(self): - return iter(self._dims) + for key in self._sizes: + if key in self._variables: + yield key def __len__(self): - return len(self._dims) + return sum(key in self._variables for key in self._sizes) def __contains__(self, key): - return key in self._dims + return key in self._sizes and key in self._variables def __getitem__(self, key): - if key in self: - return self._variables[key].to_index() - else: + if key not in self._sizes: raise KeyError(key) + return self._variables[key].to_index() def __unicode__(self): return formatting.indexes_repr(self) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index fdbb8c773f6..3b35fd5b026 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -14,14 +14,14 @@ from . import rolling from . import ops from . import utils -from .alignment import align +from .alignment import align, reindex_like_indexers from .common import AbstractArray, BaseDataObject -from .coordinates import (DataArrayCoordinates, LevelCoordinates, +from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource, Indexes) from .dataset import Dataset -from .pycompat import iteritems, basestring, OrderedDict, zip -from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, - default_index_coordinate, +from .pycompat import iteritems, basestring, OrderedDict, zip, range +from .variable import (as_variable, Variable, as_compatible_data, + IndexVariable, assert_unique_multiindex_level_names) from .formatting import format_item from .utils import decode_numpy_dict_values, ensure_us_time_resolution @@ -71,10 +71,6 @@ def _infer_coords_and_dims(shape, coords, dims): var.dims = (dim,) new_coords[dim] = var - for dim, size in zip(dims, shape): - if dim not in new_coords: - new_coords[dim] = default_index_coordinate(dim, size) - sizes = dict(zip(dims, shape)) for k, v in new_coords.items(): if any(d not in dims for d in v.dims): @@ -225,6 +221,9 @@ def __init__(self, data, coords=None, dims=None, name=None, coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, encoding, fastpath=True) + # uncomment for a useful consistency check: + # assert all(isinstance(v, Variable) for v in coords.values()) + # These fully describe a DataArray self._variable = variable self._coords = coords @@ -234,7 +233,6 @@ def __init__(self, data, coords=None, dims=None, name=None, self._initialized = True - __default = object() def _replace(self, variable=None, coords=None, name=__default): @@ -285,14 +283,17 @@ def _from_temp_dataset(self, dataset, name=__default): def _to_dataset_split(self, 
dim): def subset(dim, label): - array = self.loc[{dim: label}].drop(dim) + array = self.loc[{dim: label}] + if dim in array.coords: + del array.coords[dim] array.attrs = {} return array variables = OrderedDict([(label, subset(dim, label)) - for label in self.indexes[dim]]) + for label in self.get_index(dim)]) coords = self.coords.to_dataset() - del coords[dim] + if dim in coords: + del coords[dim] return Dataset(variables, coords, self.attrs) def _to_dataset_whole(self, name=None, shallow_copy=True): @@ -448,17 +449,21 @@ def _level_coords(self): level_coords.update({lname: dim for lname in level_names}) return level_coords - def __getitem__(self, key): - if isinstance(key, basestring): - from .dataset import _get_virtual_variable + def _getitem_coord(self, key): + from .dataset import _get_virtual_variable - try: - var = self._coords[key] - except KeyError: - _, key, var = _get_virtual_variable( - self._coords, key, self._level_coords) + try: + var = self._coords[key] + except KeyError: + dim_sizes = dict(zip(self.dims, self.shape)) + _, key, var = _get_virtual_variable( + self._coords, key, self._level_coords, dim_sizes) + + return self._replace_maybe_drop_dims(var, name=key) - return self._replace_maybe_drop_dims(var, name=key) + def __getitem__(self, key): + if isinstance(key, basestring): + return self._getitem_coord(key) else: # orthogonal array indexing return self.isel(**self._item_key_to_dict(key)) @@ -476,7 +481,7 @@ def __delitem__(self, key): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self.coords, LevelCoordinates(self), self.attrs] + return [self.coords, LevelCoordinatesSource(self), self.attrs] def __contains__(self, key): return key in self._coords @@ -510,7 +515,7 @@ def encoding(self, value): def indexes(self): """OrderedDict of pandas.Index objects used for label based indexing """ - return Indexes(self._coords, self.dims) + return Indexes(self._coords, self.sizes) @property def coords(self): @@ -720,8 +725,7 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): DataArray.reindex align """ - indexers = dict((k, v) for k, v in other.indexes.items() - if k in self.dims) + indexers = reindex_like_indexers(self, other) return self.reindex(method=method, tolerance=tolerance, copy=copy, **indexers) @@ -1044,7 +1048,8 @@ def to_pandas(self): except KeyError: raise ValueError('cannot convert arrays with %s dimensions into ' 'pandas objects' % self.ndim) - return constructor(self.values, *self.indexes.values()) + indexes = [self.get_index(dim) for dim in self.dims] + return constructor(self.values, *indexes) def to_dataframe(self, name=None): """Convert this array and its coordinates into a tidy pandas.DataFrame. 
@@ -1157,10 +1162,10 @@ def to_netcdf(self, *args, **kwargs): """ from ..backends.api import DATAARRAY_NAME, DATAARRAY_VARIABLE - if not self.name: + if self.name is None: # If no name is set then use a generic xarray name dataset = self.to_dataset(name=DATAARRAY_VARIABLE) - elif self.name in list(self.coords): + elif self.name in self.coords or self.name in self.dims: # The name is the same as one of the coords names, which netCDF # doesn't support, so rename it but keep track of the old name dataset = self.to_dataset(name=DATAARRAY_VARIABLE) @@ -1614,7 +1619,8 @@ def dot(self, other): axes = (self.get_axis_num(dims), other.get_axis_num(dims)) new_data = ops.tensordot(self.data, other.data, axes=axes) - new_coords = self.coords.merge(other.coords).drop(dims) + new_coords = self.coords.merge(other.coords) + new_coords = new_coords.drop([d for d in dims if d in new_coords]) new_dims = ([d for d in self.dims if d not in dims] + [d for d in other.dims if d not in dims]) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 964bd3dbb7a..da57aa14c17 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function import functools -import warnings from collections import Mapping from numbers import Number @@ -11,14 +10,13 @@ from . import ops from . import utils -from . import common from . import groupby from . import indexing from . import alignment from . import formatting from .. import conventions from .alignment import align -from .coordinates import DatasetCoordinates, LevelCoordinates, Indexes +from .coordinates import DatasetCoordinates, LevelCoordinatesSource, Indexes from .common import ImplementsDatasetReduce, BaseDataObject from .merge import (dataset_update_method, dataset_merge_method, merge_data_and_coords) @@ -26,7 +24,7 @@ decode_numpy_dict_values, ensure_us_time_resolution) from .variable import (Variable, as_variable, IndexVariable, broadcast_variables) from .pycompat import (iteritems, basestring, OrderedDict, - dask_array_type) + dask_array_type, range) from .combine import concat from .options import OPTIONS @@ -38,10 +36,20 @@ 'quarter'] -def _get_virtual_variable(variables, key, level_vars={}): +def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None): """Get a virtual variable (e.g., 'time.year' or a MultiIndex level) from a dict of xarray.Variable objects (if possible) """ + if level_vars is None: + level_vars = {} + if dim_sizes is None: + dim_sizes = {} + + if key in dim_sizes: + data = pd.Index(range(dim_sizes[key]), name=key) + variable = IndexVariable((key,), data) + return key, key, variable + if not isinstance(key, basestring): raise KeyError(key) @@ -225,13 +233,6 @@ def __init__(self, data_vars=None, coords=None, attrs=None, self.attrs = attrs self._initialized = True - def _add_missing_coords_inplace(self): - """Add missing coordinates to self._variables - """ - for dim, size in iteritems(self.dims): - if dim not in self._variables: - self._variables[dim] = default_index_coordinate(dim, size) - def _set_init_vars_and_dims(self, data_vars, coords, compat): """Set the initial value of Dataset variables and dimensions """ @@ -475,9 +476,9 @@ def _copy_listed(self, names): variables[name] = self._variables[name] except KeyError: ref_name, var_name, var = _get_virtual_variable( - self._variables, name, self._level_coords) + self._variables, name, self._level_coords, self.dims) variables[var_name] = var - if ref_name in self._coord_names: + if 
ref_name in self._coord_names or ref_name in self.dims: coord_names.add(var_name) return self._subset_with_all_valid_coords(variables, coord_names, @@ -492,7 +493,7 @@ def _construct_dataarray(self, name): variable = self._variables[name] except KeyError: _, name, variable = _get_virtual_variable( - self._variables, name, self._level_coords) + self._variables, name, self._level_coords, self.dims) coords = OrderedDict() needed_dims = set(variable.dims) @@ -513,7 +514,7 @@ def __deepcopy__(self, memo=None): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self, LevelCoordinates(self), self.attrs] + return [self, LevelCoordinatesSource(self), self.attrs] def __contains__(self, key): """The 'in' operator will return true or false depending on whether @@ -571,22 +572,9 @@ def __setitem__(self, key, value): def __delitem__(self, key): """Remove a variable from this dataset. - - If this variable is a dimension, all variables containing this - dimension are also removed. """ - def remove(k): - del self._variables[k] - self._coord_names.discard(k) - - remove(key) - - if key in self._dims: - del self._dims[key] - also_delete = [k for k, v in iteritems(self._variables) - if key in v.dims] - for key in also_delete: - remove(key) + del self._variables[key] + self._coord_names.discard(key) # mutable objects should not be hashable __hash__ = None @@ -1180,8 +1168,7 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): Dataset.reindex align """ - indexers = dict((k, v) for k, v in other.indexes.items() - if k in self.dims) + indexers = alignment.reindex_like_indexers(self, other) return self.reindex(method=method, copy=copy, tolerance=tolerance, **indexers) @@ -1236,8 +1223,11 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, **kw_in raise ValueError('invalid reindex dimensions: %s' % bad_dims) variables = alignment.reindex_variables( - self.variables, self.indexes, indexers, method, tolerance, copy=copy) - return self._replace_vars_and_dims(variables) + self.variables, self.sizes, self.indexes, indexers, method, + tolerance, copy=copy) + coord_names = set(self._coord_names) + coord_names.update(indexers) + return self._replace_vars_and_dims(variables, coord_names) def rename(self, name_dict, inplace=False): """Returns a new object with renamed variables and dimensions. @@ -1263,9 +1253,9 @@ def rename(self, name_dict, inplace=False): DataArray.rename """ for k, v in name_dict.items(): - if k not in self: + if k not in self and k not in self.dims: raise ValueError("cannot rename %r because it is not a " - "variable in this dataset" % k) + "variable or dimension in this dataset" % k) if v in self and k != v: raise ValueError('the new name %r already exists' % v) @@ -1280,7 +1270,10 @@ def rename(self, name_dict, inplace=False): if k in self._coord_names: coord_names.add(name) - return self._replace_vars_and_dims(variables, coord_names, + dims = OrderedDict((name_dict.get(k, k), v) + for k, v in self.dims.items()) + + return self._replace_vars_and_dims(variables, coord_names, dims=dims, inplace=inplace) def swap_dims(self, dims_dict, inplace=False): @@ -1349,8 +1342,16 @@ def _stack_once(self, dims, new_dim): else: variables[name] = var.copy(deep=False) - idx = utils.multiindex_from_product_levels( - [self.indexes[d] for d in dims], names=dims) + # consider dropping levels that are unused? 
+ levels = [self.get_index(dim) for dim in dims] + if hasattr(pd, 'RangeIndex'): + # RangeIndex levels in a MultiIndex are broken for appending in + # pandas before v0.19.0 + levels = [pd.Int64Index(level) + if isinstance(level, pd.RangeIndex) + else level + for level in levels] + idx = utils.multiindex_from_product_levels(levels, names=dims) variables[new_dim] = IndexVariable(new_dim, idx) coord_names = set(self._coord_names) - set(dims) | set([new_dim]) @@ -1408,7 +1409,7 @@ def unstack(self, dim): if dim not in self.dims: raise ValueError('invalid dimension: %s' % dim) - index = self.indexes[dim] + index = self.get_index(dim) if not isinstance(index, pd.MultiIndex): raise ValueError('cannot unstack a dimension that does not have ' 'a MultiIndex') @@ -1530,9 +1531,6 @@ def _assert_all_in_dataset(self, names, virtual_okay=False): def drop(self, labels, dim=None): """Drop variables or index labels from this dataset. - If a variable corresponding to a dimension is dropped, all variables - that use that dimension are also dropped. - Parameters ---------- labels : scalar or list of scalars @@ -1550,14 +1548,17 @@ def drop(self, labels, dim=None): if dim is None: return self._drop_vars(labels) else: - new_index = self.indexes[dim].drop(labels) + try: + index = self.indexes[dim] + except KeyError: + raise ValueError( + 'dimension %r does not have coordinate labels' % dim) + new_index = index.drop(labels) return self.loc[{dim: new_index}] def _drop_vars(self, names): self._assert_all_in_dataset(names) drop = set(names) - drop |= set(k for k, v in iteritems(self._variables) - if any(name in v.dims for name in names)) variables = OrderedDict((k, v) for k, v in iteritems(self._variables) if k not in drop) coord_names = set(k for k in self._coord_names if k in variables) @@ -2091,6 +2092,7 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars): new_vars = OrderedDict((k, f(self.variables[k], other_variable)) for k in self.data_vars) ds._variables.update(new_vars) + ds._dims = calculate_dimensions(ds._variables) return ds def _copy_attrs_from(self, other): diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index ee9bb0e296c..48ebbab2a30 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -11,8 +11,8 @@ from .common import ( ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote, ) -from .pycompat import zip -from .utils import peek_at, maybe_wrap_array, safe_cast_to_index +from .pycompat import range, zip +from .utils import hashable, peek_at, maybe_wrap_array, safe_cast_to_index from .variable import as_variable, Variable, IndexVariable @@ -73,6 +73,7 @@ def _dummy_copy(xarray_obj): raise AssertionError return res + def _is_one_or_none(obj): return obj == 1 or obj is None @@ -120,6 +121,46 @@ def _inverse_permutation_indices(positions): return indices +class _DummyGroup(object): + """Class for keeping track of grouped dimensions without coordinates. + + Should not be user visible. 
+ """ + + def __init__(self, obj, name, coords): + self.name = name + self.coords = coords + self.dims = (name,) + self.ndim = 1 + self.size = obj.sizes[name] + self.values = range(self.size) + + +def _ensure_1d(group, obj): + if group.ndim != 1: + # try to stack the dims of the group into a single dim + orig_dims = group.dims + stacked_dim = 'stacked_' + '_'.join(orig_dims) + # these dimensions get created by the stack operation + inserted_dims = [dim for dim in group.dims if dim not in group.coords] + # the copy is necessary here, otherwise read only array raises error + # in pandas: https://github.com/pydata/pandas/issues/12813 + group = group.stack(**{stacked_dim: orig_dims}).copy() + obj = obj.stack(**{stacked_dim: orig_dims}) + else: + stacked_dim = None + inserted_dims = [] + return group, obj, stacked_dim, inserted_dims + + +def _unique_and_monotonic(group): + if isinstance(group, _DummyGroup): + return True + else: + index = safe_cast_to_index(group) + return index.is_unique and index.is_monotonic + + class GroupBy(object): """A object that implements the split-apply-combine pattern. @@ -137,15 +178,15 @@ class GroupBy(object): DataArray.groupby """ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, - cut_kwargs={}): + cut_kwargs={}): """Create a GroupBy object Parameters ---------- obj : Dataset or DataArray Object to group. - group : DataArray or IndexVariable - 1-dimensional array with the group values. + group : DataArray + Array with the group values. squeeze : boolean, optional If "group" is a coordinate of object, `squeeze` controls whether the subarrays have a dimension of length 1 along that coordinate or @@ -159,26 +200,24 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, Extra keyword arguments to pass to `pandas.cut` """ - from .dataset import as_dataset from .dataarray import DataArray + if grouper is not None and bins is not None: + raise TypeError("can't specify both `grouper` and `bins`") + + if not isinstance(group, (DataArray, IndexVariable)): + if not hashable(group): + raise TypeError('`group` must be an xarray.DataArray or the ' + 'name of an xarray variable or dimension') + group = obj[group] + if group.name not in obj and group.name in obj.dims: + # DummyGroups should not appear on groupby results + group = _DummyGroup(obj, group.name, group.coords) + if getattr(group, 'name', None) is None: raise ValueError('`group` must have a name') - self._stacked_dim = None - if group.ndim != 1: - # try to stack the dims of the group into a single dim - # TODO: figure out how to exclude dimensions from the stacking - # (e.g. 
group over space dims but leave time dim intact) - orig_dims = group.dims - stacked_dim_name = 'stacked_' + '_'.join(orig_dims) - # the copy is necessary here, otherwise read only array raises error - # in pandas: https://github.com/pydata/pandas/issues/12813 - group = group.stack(**{stacked_dim_name: orig_dims}).copy() - obj = obj.stack(**{stacked_dim_name: orig_dims}) - self._stacked_dim = stacked_dim_name - self._unstacked_dims = orig_dims - if not hasattr(group, 'dims'): - raise ValueError("`group` must have a 'dims' attribute") + + group, obj, stacked_dim, inserted_dims = _ensure_1d(group, obj) group_dim, = group.dims expected_size = obj.sizes[group_dim] @@ -186,84 +225,87 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' 'dimension') + full_index = None - if grouper is not None and bins is not None: - raise TypeError("Can't specify both `grouper` and `bins`.") if bins is not None: binned = pd.cut(group.values, bins, **cut_kwargs) new_dim_name = group.name + '_bins' group = DataArray(binned, group.coords, name=new_dim_name) full_index = binned.categories + if grouper is not None: index = safe_cast_to_index(group) if not index.is_monotonic: # TODO: sort instead of raising an error raise ValueError('index must be monotonic for resampling') s = pd.Series(np.arange(index.size), index) - if grouper is not None: - first_items = s.groupby(grouper).first() + first_items = s.groupby(grouper).first() if first_items.isnull().any(): full_index = first_items.index first_items = first_items.dropna() sbins = first_items.values.astype(np.int64) - group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] + + group_indices = ([slice(i, j) + for i, j in zip(sbins[:-1], sbins[1:])] + [slice(sbins[-1], None)]) unique_coord = IndexVariable(group.name, first_items.index) - elif group.name in obj.dims and bins is None: - # assume that group already has sorted, unique values - # (if using bins, the group will have the same name as a dimension - # but different values) - if group.dims != (group.name,): - raise ValueError('`group` is required to be a coordinate if ' - '`group.name` is a dimension in `obj`') + elif group.dims == (group.name,) and _unique_and_monotonic(group): + # no need to factorize group_indices = np.arange(group.size) if not squeeze: - # group_indices = group_indices.reshape(-1, 1) # use slices to do views instead of fancy indexing + # equivalent to: group_indices = group_indices.reshape(-1, 1) group_indices = [slice(i, i + 1) for i in group_indices] unique_coord = group else: # look through group to find the unique values - sort = bins is None - unique_values, group_indices = unique_value_groups(group, sort=sort) + unique_values, group_indices = unique_value_groups( + group, sort=(bins is None)) unique_coord = IndexVariable(group.name, unique_values) - self.obj = obj - self.group = group - self.group_dim = group_dim - self.group_indices = group_indices - self.unique_coord = unique_coord - self._groups = None + # specification for the groupby operation + self._obj = obj + self._group = group + self._group_dim = group_dim + self._group_indices = group_indices + self._unique_coord = unique_coord + self._stacked_dim = stacked_dim + self._inserted_dims = inserted_dims self._full_index = full_index + # cached attributes + self._groups = None + @property def groups(self): # provided to mimic pandas.groupby if self._groups is None: - self._groups = 
dict(zip(self.unique_coord.values, - self.group_indices)) + self._groups = dict(zip(self._unique_coord.values, + self._group_indices)) return self._groups def __len__(self): - return self.unique_coord.size + return self._unique_coord.size def __iter__(self): - return zip(self.unique_coord.values, self._iter_grouped()) + return zip(self._unique_coord.values, self._iter_grouped()) def _iter_grouped(self): """Iterate over each element in this group""" - for indices in self.group_indices: - yield self.obj.isel(**{self.group_dim: indices}) + for indices in self._group_indices: + yield self._obj.isel(**{self._group_dim: indices}) def _infer_concat_args(self, applied_example): - if self.group_dim in applied_example.dims: - concat_dim = self.group - positions = self.group_indices + if self._group_dim in applied_example.dims: + coord = self._group + positions = self._group_indices else: - concat_dim = self.unique_coord + coord = self._unique_coord positions = None - return concat_dim, positions + dim, = coord.dims + if isinstance(coord, _DummyGroup): + coord = None + return coord, dim, positions @staticmethod def _binary_op(f, reflexive=False, **ignored_kwargs): @@ -271,7 +313,7 @@ def _binary_op(f, reflexive=False, **ignored_kwargs): def func(self, other): g = f if not reflexive else lambda x, y: f(y, x) applied = self._yield_binary_applied(g, other) - combined = self._concat(applied) + combined = self._combine(applied) return combined return func @@ -280,17 +322,17 @@ def _yield_binary_applied(self, func, other): for group_value, obj in self: try: - other_sel = other.sel(**{self.group.name: group_value}) + other_sel = other.sel(**{self._group.name: group_value}) except AttributeError: raise TypeError('GroupBy objects only support binary ops ' 'when the other argument is a Dataset or ' 'DataArray') except (KeyError, ValueError): - if self.group.name not in other.dims: + if self._group.name not in other.dims: raise ValueError('incompatible dimensions for a grouped ' 'binary operation: the group variable %r ' 'is not a dimension on the other argument' - % self.group.name) + % self._group.name) if dummy is None: dummy = _dummy_copy(other) other_sel = dummy @@ -302,17 +344,21 @@ def _maybe_restore_empty_groups(self, combined): """Our index contained empty groups (e.g., from a resampling). If we reduced on that dimension, we want to restore the full index. """ - if (self._full_index is not None and self.group.name in combined.dims): - indexers = {self.group.name: self._full_index} + if (self._full_index is not None and + self._group.name in combined.dims): + indexers = {self._group.name: self._full_index} combined = combined.reindex(**indexers) return combined - def _maybe_unstack_array(self, arr): + def _maybe_unstack(self, obj): """This gets called if we are applying on an array with a multidimensional group.""" - if self._stacked_dim is not None and self._stacked_dim in arr.dims: - arr = arr.unstack(self._stacked_dim) - return arr + if self._stacked_dim is not None and self._stacked_dim in obj.dims: + obj = obj.unstack(self._stacked_dim) + for dim in self._inserted_dims: + if dim in obj.coords: + del obj.coords[dim] + return obj def fillna(self, value): """Fill missing values in this object by group. @@ -360,11 +406,11 @@ def where(self, cond): return self._where(cond) def _first_or_last(self, op, skipna, keep_attrs): - if isinstance(self.group_indices[0], (int, np.integer)): + if isinstance(self._group_indices[0], (int, np.integer)): # NB. 
this is currently only used for reductions along an existing
             # dimension
-            return self.obj
-        return self.reduce(op, self.group_dim, skipna=skipna,
+            return self._obj
+        return self.reduce(op, self._group_dim, skipna=skipna,
                            keep_attrs=keep_attrs, allow_lazy=True)
 
     def first(self, skipna=None, keep_attrs=True):
@@ -387,13 +433,12 @@ def assign_coords(self, **kwargs):
         return self.apply(lambda ds: ds.assign_coords(**kwargs))
 
 
-def _maybe_reorder(xarray_obj, concat_dim, positions):
+def _maybe_reorder(xarray_obj, dim, positions):
     order = _inverse_permutation_indices(positions)
 
     if order is None:
         return xarray_obj
     else:
-        dim, = concat_dim.dims
         return xarray_obj[{dim: order}]
 
 
@@ -404,27 +449,26 @@ def _iter_grouped_shortcut(self):
         """Fast version of `_iter_grouped` that yields Variables without
         metadata
         """
-        var = self.obj.variable
-        for indices in self.group_indices:
-            yield var[{self.group_dim: indices}]
+        var = self._obj.variable
+        for indices in self._group_indices:
+            yield var[{self._group_dim: indices}]
 
-    def _concat_shortcut(self, applied, concat_dim, positions=None):
+    def _concat_shortcut(self, applied, dim, positions=None):
         # nb. don't worry too much about maintaining this method -- it does
         # speed things up, but it's not very interpretable and there are much
         # faster alternatives (e.g., doing the grouped aggregation in a
         # compiled language)
-        stacked = Variable.concat(applied, concat_dim, shortcut=True)
-        reordered = _maybe_reorder(stacked, concat_dim, positions)
-        result = self.obj._replace_maybe_drop_dims(reordered)
-        result._coords[concat_dim.name] = as_variable(concat_dim, copy=True)
+        stacked = Variable.concat(applied, dim, shortcut=True)
+        reordered = _maybe_reorder(stacked, dim, positions)
+        result = self._obj._replace_maybe_drop_dims(reordered)
         return result
 
     def _restore_dim_order(self, stacked):
         def lookup_order(dimension):
-            if dimension == self.group.name:
-                dimension, = self.group.dims
-            if dimension in self.obj.dims:
-                axis = self.obj.get_axis_num(dimension)
+            if dimension == self._group.name:
+                dimension, = self._group.dims
+            if dimension in self._obj.dims:
+                axis = self._obj.get_axis_num(dimension)
             else:
                 axis = 1e6  # some arbitrarily high value
             return axis
@@ -432,12 +476,6 @@ def lookup_order(dimension):
         new_order = sorted(stacked.dims, key=lookup_order)
         return stacked.transpose(*new_order)
 
-    def _restore_multiindex(self, combined):
-        if self._stacked_dim is not None and self._stacked_dim in combined.dims:
-            stacked_dim = self.group[self._stacked_dim]
-            combined[self._stacked_dim] = stacked_dim
-        return combined
-
     def apply(self, func, shortcut=False, **kwargs):
         """Apply a function over each array in the group and concatenate them
         together into a new array.
@@ -472,31 +510,37 @@ def apply(self, func, shortcut=False, **kwargs):
 
         Returns
         -------
-        applied : DataArray
+        applied : DataArray or Dataset
            The result of splitting, applying and combining this array.
""" if shortcut: grouped = self._iter_grouped_shortcut() else: grouped = self._iter_grouped() - applied = (maybe_wrap_array(arr, func(arr, **kwargs)) for arr in grouped) - combined = self._concat(applied, shortcut=shortcut) - result = self._maybe_restore_empty_groups( - self._maybe_unstack_array(combined)) - return result + applied = (maybe_wrap_array(arr, func(arr, **kwargs)) + for arr in grouped) + return self._combine(applied, shortcut=shortcut) - def _concat(self, applied, shortcut=False): - # peek at applied to determine which coordinate to stack over + def _combine(self, applied, shortcut=False): + """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) - concat_dim, positions = self._infer_concat_args(applied_example) + coord, dim, positions = self._infer_concat_args(applied_example) if shortcut: - combined = self._concat_shortcut(applied, concat_dim, positions) + combined = self._concat_shortcut(applied, dim, positions) else: - combined = concat(applied, concat_dim) - combined = _maybe_reorder(combined, concat_dim, positions) - if isinstance(combined, type(self.obj)): + combined = concat(applied, dim) + combined = _maybe_reorder(combined, dim, positions) + + if isinstance(combined, type(self._obj)): + # only restore dimension order for arrays combined = self._restore_dim_order(combined) - combined = self._restore_multiindex(combined) + if coord is not None: + if shortcut: + combined._coords[coord.name] = as_variable(coord, copy=True) + else: + combined.coords[coord.name] = coord + combined = self._maybe_restore_empty_groups(combined) + combined = self._maybe_unstack(combined) return combined def reduce(self, func, dim=None, axis=None, keep_attrs=False, @@ -562,22 +606,24 @@ def apply(self, func, **kwargs): Returns ------- - applied : Dataset + applied : Dataset or DataArray The result of splitting, applying and combining this dataset. """ kwargs.pop('shortcut', None) # ignore shortcut if set (for now) applied = (func(ds, **kwargs) for ds in self._iter_grouped()) - combined = self._concat(applied) - result = self._maybe_restore_empty_groups(combined) - return result + return self._combine(applied) - def _concat(self, applied): + def _combine(self, applied): + """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) - concat_dim, positions = self._infer_concat_args(applied_example) - - combined = concat(applied, concat_dim) - reordered = _maybe_reorder(combined, concat_dim, positions) - return reordered + coord, dim, positions = self._infer_concat_args(applied_example) + combined = concat(applied, dim) + combined = _maybe_reorder(combined, dim, positions) + if coord is not None: + combined[coord.name] = coord + combined = self._maybe_restore_empty_groups(combined) + combined = self._maybe_unstack(combined) + return combined def reduce(self, func, dim=None, keep_attrs=False, **kwargs): """Reduce the items in this group by applying `func` along some diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 8cbb91ebd4f..7060f6a3960 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -145,24 +145,34 @@ def _is_nested_tuple(possible_tuple): for value in possible_tuple)) -def convert_label_indexer(index, label, index_name='', method=None, - tolerance=None): - """Given a pandas.Index and labels (e.g., from __getitem__) for one - dimension, return an indexer suitable for indexing an ndarray along that - dimension. 
If `index` is a pandas.MultiIndex and depending on `label`, - return a new pandas.Index or pandas.MultiIndex (otherwise return None). - """ +def _index_method_kwargs(method, tolerance): # backwards compatibility for pandas<0.16 (method) or pandas<0.17 # (tolerance) kwargs = {} if method is not None: kwargs['method'] = method if tolerance is not None: - if pd.__version__ < '0.17': - raise NotImplementedError( - 'the tolerance argument requires pandas v0.17 or newer') kwargs['tolerance'] = tolerance + return kwargs + + +def get_loc(index, label, method=None, tolerance=None): + kwargs = _index_method_kwargs(method, tolerance) + return index.get_loc(label, **kwargs) + + +def get_indexer(index, labels, method=None, tolerance=None): + kwargs = _index_method_kwargs(method, tolerance) + return index.get_indexer(labels, **kwargs) + +def convert_label_indexer(index, label, index_name='', method=None, + tolerance=None): + """Given a pandas.Index and labels (e.g., from __getitem__) for one + dimension, return an indexer suitable for indexing an ndarray along that + dimension. If `index` is a pandas.MultiIndex and depending on `label`, + return a new pandas.Index or pandas.MultiIndex (otherwise return None). + """ new_index = None if isinstance(label, slice): @@ -207,11 +217,11 @@ def convert_label_indexer(index, label, index_name='', method=None, if isinstance(index, pd.MultiIndex): indexer, new_index = index.get_loc_level(label.item(), level=0) else: - indexer = index.get_loc(label.item(), **kwargs) + indexer = get_loc(index, label.item(), method, tolerance) elif label.dtype.kind == 'b': indexer, = np.nonzero(label) else: - indexer = index.get_indexer(label, **kwargs) + indexer = get_indexer(index, label, method, tolerance) if np.any(indexer < 0): raise KeyError('not all values found in index %r' % index_name) @@ -259,14 +269,26 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): if method is not None and not isinstance(method, str): raise TypeError('``method`` must be a string') - pos_indexers, new_indexes = {}, {} - for dim, label in iteritems(get_dim_indexers(data_obj, indexers)): - index = data_obj[dim].to_index() - idxr, new_idx = convert_label_indexer(index, label, - dim, method, tolerance) - pos_indexers[dim] = idxr - if new_idx is not None: - new_indexes[dim] = new_idx + pos_indexers = {} + new_indexes = {} + + dim_indexers = get_dim_indexers(data_obj, indexers) + for dim, label in iteritems(dim_indexers): + try: + index = data_obj.indexes[dim] + except KeyError: + # no index for this dimension: reuse the provided labels + if method is not None or tolerance is not None: + raise ValueError('cannot supply ``method`` or ``tolerance`` ' + 'when the indexed dimension does not have ' + 'an associated coordinate.') + pos_indexers[dim] = label + else: + idxr, new_idx = convert_label_indexer(index, label, + dim, method, tolerance) + pos_indexers[dim] = idxr + if new_idx is not None: + new_indexes[dim] = new_idx return pos_indexers, new_indexes @@ -308,47 +330,6 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer -class LazyIntegerRange(utils.NDArrayMixin): - - def __init__(self, *args, **kwdargs): - """ - Parameters - ---------- - See np.arange - """ - self.args = args - self.kwdargs = kwdargs - assert 'dtype' not in self.kwdargs - # range will fail if any arguments are not integers - self.array = range(*args, **kwdargs) - - @property - def shape(self): - return (len(self.array),) - - @property - def dtype(self): - return np.dtype('int64') - - 
@property - def ndim(self): - return 1 - - @property - def size(self): - return len(self.array) - - def __getitem__(self, key): - return np.array(self)[key] - - def __array__(self, dtype=None): - return np.arange(*self.args, **self.kwdargs) - - def __repr__(self): - return ('%s(array=%r)' % - (type(self).__name__, self.array)) - - class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy """ diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 83763d5def6..5ea490a004b 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -4,9 +4,8 @@ import pandas as pd from .alignment import align -from .utils import Frozen, is_dict_like -from .variable import (as_variable, default_index_coordinate, - assert_unique_multiindex_level_names) +from .utils import Frozen +from .variable import (as_variable, assert_unique_multiindex_level_names) from .pycompat import (basestring, OrderedDict) @@ -461,10 +460,8 @@ def merge_core(objs, dims = calculate_dimensions(variables) for dim, size in dims.items(): - if dim not in variables: - variables[dim] = default_index_coordinate(dim, size) - - coord_names.update(dims) + if dim in variables: + coord_names.add(dim) ambiguous_coords = coord_names.intersection(noncoord_names) if ambiguous_coords: diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6c4ebef9c96..dd12ce76aa0 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -164,10 +164,10 @@ def reduce(self, func, **kwargs): for _, window in self] # Find valid windows based on count + concat_dim = self.window_labels if self.dim in self.obj else self.dim counts = concat([window.count(dim=self.dim) for _, window in self], - dim=self.obj[self.dim]) - - result = concat(windows, dim=self.window_labels) + dim=concat_dim) + result = concat(windows, dim=concat_dim) result = result.where(counts >= self._min_periods) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6aafbcaab82..a4d995cdec0 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -15,8 +15,7 @@ from . import utils from . import nputils from .pycompat import basestring, OrderedDict, zip, dask_array_type -from .indexing import (PandasIndexAdapter, orthogonally_indexable, - LazyIntegerRange) +from .indexing import (PandasIndexAdapter, orthogonally_indexable) import xarray as xr # only for Dataset and DataArray @@ -85,14 +84,6 @@ def as_variable(obj, name=None, copy=False): return obj -def default_index_coordinate(dim, size): - """ - This is equivalent to np.arange(size), but waits to create the array until - its actually accessed. 
- """ - return IndexVariable(dim, LazyIntegerRange(size)) - - def _maybe_wrap_data(data): """ Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index c1eecda7c69..bf849b979c5 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -157,9 +157,10 @@ def line(darray, *args, **kwargs): if ax is None: ax = plt.gca() - xlabel, x = list(darray.indexes.items())[0] + xlabel, = darray.dims + x = darray.coords[xlabel] - _ensure_plottable([x]) + _ensure_plottable(x) primitive = ax.plot(x, darray, *args, **kwargs) diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index f885521ea03..8e360c71477 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -215,6 +215,6 @@ def _infer_xy_labels(darray, x, y): y, x = darray.dims elif x is None or y is None: raise ValueError('cannot supply only one of x and y') - elif any(k not in darray.coords for k in (x, y)): + elif any(k not in darray.coords and k not in darray.dims for k in (x, y)): raise ValueError('x and y must be coordinate variables') return x, y diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 95af76f58a0..136cbbea78e 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -1091,7 +1091,8 @@ class MiscObject: class TestValidateAttrs(TestCase): def test_validating_attrs(self): def new_dataset(): - return Dataset({'data': ('y', np.arange(10.0))}) + return Dataset({'data': ('y', np.arange(10.0))}, + {'y': np.arange(10)}) def new_dataset_and_dataset_attrs(): ds = new_dataset() diff --git a/xarray/test/test_combine.py b/xarray/test/test_combine.py index 4fca8fcb067..7813378277a 100644 --- a/xarray/test/test_combine.py +++ b/xarray/test/test_combine.py @@ -18,7 +18,10 @@ def test_concat(self): # TODO: simplify and split this test case # drop the third dimension to keep things relatively understandable - data = create_test_data().drop('dim3') + data = create_test_data() + for k in list(data): + if 'dim3' in data[k].dims: + del data[k] split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] @@ -34,19 +37,21 @@ def rectify_dim_order(dataset): for dim in ['dim1', 'dim2']: datasets = [g for _, g in data.groupby(dim, squeeze=False)] self.assertDatasetIdentical(data, concat(datasets, dim)) - self.assertDatasetIdentical( - data, concat(datasets, data[dim])) - self.assertDatasetIdentical( - data, concat(datasets, data[dim], coords='minimal')) - datasets = [g for _, g in data.groupby(dim, squeeze=True)] - concat_over = [k for k, v in iteritems(data.coords) - if dim in v.dims and k != dim] - actual = concat(datasets, data[dim], coords=concat_over) - self.assertDatasetIdentical(data, rectify_dim_order(actual)) + dim = 'dim2' + self.assertDatasetIdentical( + data, concat(datasets, data[dim])) + self.assertDatasetIdentical( + data, concat(datasets, data[dim], coords='minimal')) - actual = concat(datasets, data[dim], coords='different') - self.assertDatasetIdentical(data, rectify_dim_order(actual)) + datasets = [g for _, g in data.groupby(dim, squeeze=True)] + concat_over = [k for k, v in iteritems(data.coords) + if dim in v.dims and k != dim] + actual = concat(datasets, data[dim], coords=concat_over) + self.assertDatasetIdentical(data, rectify_dim_order(actual)) + + actual = concat(datasets, data[dim], coords='different') + self.assertDatasetIdentical(data, rectify_dim_order(actual)) # make sure the coords argument behaves as expected data.coords['extra'] = ('dim4', np.arange(3)) @@ -114,7 +119,8 @@ def 
test_concat_autoalign(self): ds2 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 3])])}) actual = concat([ds1, ds2], 'y') expected = Dataset({'foo': DataArray([[1, 2, np.nan], [1, np.nan, 2]], - dims=['y', 'x'], coords={'y': [0, 1], 'x': [1, 2, 3]})}) + dims=['y', 'x'], + coords={'x': [1, 2, 3]})}) self.assertDatasetIdentical(expected, actual) def test_concat_errors(self): @@ -187,26 +193,27 @@ def test_concat_promote_shape(self): objs = [Dataset({'x': [0]}, {'y': -1}), Dataset({'x': [1, 2]}, {'y': -2})] actual = concat(objs, 'x') - expected = Dataset({}, {'y': ('x', [-1, -2, -2])}) + expected = Dataset({'x': [0, 1, 2]}, {'y': ('x', [-1, -2, -2])}) self.assertDatasetIdentical(actual, expected) # broadcast 1d x 1d -> 2d objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}), Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})] actual = concat(objs, 'x') - expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}) + expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}, + {'x': [0, 1], 'y': [0]}) self.assertDatasetIdentical(actual, expected) def test_concat_do_not_promote(self): # GH438 - objs = [Dataset({'y': ('t', [1])}, {'x': 1}), - Dataset({'y': ('t', [2])}, {'x': 1})] + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 1, 't': [0]})] expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]}) actual = concat(objs, 't') self.assertDatasetIdentical(expected, actual) - objs = [Dataset({'y': ('t', [1])}, {'x': 1}), - Dataset({'y': ('t', [2])}, {'x': 2})] + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 2, 't': [0]})] with self.assertRaises(ValueError): concat(objs, 't', coords='minimal') @@ -228,14 +235,15 @@ def test_concat_multiindex(self): class TestConcatDataArray(TestCase): def test_concat(self): - ds = Dataset({'foo': (['x', 'y'], np.random.random((10, 20))), - 'bar': (['x', 'y'], np.random.random((10, 20)))}) + ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), + 'bar': (['x', 'y'], np.random.random((2, 3)))}, + {'x': [0, 1]}) foo = ds['foo'] bar = ds['bar'] # from dataset array: expected = DataArray(np.array([foo.values, bar.values]), - dims=['w', 'x', 'y']) + dims=['w', 'x', 'y'], coords={'x': [0, 1]}) actual = concat([foo, bar], 'w') self.assertDataArrayEqual(expected, actual) # from iteration: @@ -294,8 +302,7 @@ def test_auto_combine(self): objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] actual = auto_combine(objs) - expected = Dataset( - {'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 0]}) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) self.assertDatasetIdentical(expected, actual) objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] @@ -317,7 +324,8 @@ def test_auto_combine_previously_failed(self): datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), Dataset({'b': ('x', [0]), 'x': [0]}), Dataset({'a': ('x', [1]), 'x': [1]})] - expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}) + expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, + {'x': [0, 1]}) actual = auto_combine(datasets) self.assertDatasetIdentical(expected, actual) @@ -326,7 +334,8 @@ def test_auto_combine_previously_failed(self): datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] expected = Dataset({'a': (('t', 'x'), - [[np.nan, 2, 3], [1, 2, np.nan]])}) + [[np.nan, 2, 3], [1, 2, np.nan]])}, + {'x': [0, 1, 2]}) actual = 
auto_combine(datasets, concat_dim='t') self.assertDatasetIdentical(expected, actual) diff --git a/xarray/test/test_conventions.py b/xarray/test/test_conventions.py index be328eb0406..12424e93dcd 100644 --- a/xarray/test/test_conventions.py +++ b/xarray/test/test_conventions.py @@ -191,11 +191,11 @@ def test_cf_datetime(self): @requires_netCDF4 def test_decode_cf_datetime_overflow(self): - # checks for + # checks for # https://github.com/pydata/pandas/issues/14068 # https://github.com/pydata/xarray/issues/975 - from datetime import datetime + from datetime import datetime units = 'days since 2000-01-01 00:00:00' # date after 2262 and before 1678 @@ -577,7 +577,6 @@ def test_decode_cf_with_drop_variables(self): }) expected = Dataset({ 't': pd.date_range('2000-01-01', periods=3), - 'x': ("x", [0, 1, 2]), 'foo': (('t', 'x'), [[0, 0, 0], [1, 1, 1], [2, 2, 2]], {'units': 'bar'}), 'y': ('t', [5, 10, np.nan]) }) diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 305332f8a68..461e05b17fd 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -206,8 +206,10 @@ def assertLazyAndAllClose(self, expected, actual): def setUp(self): self.values = np.random.randn(4, 6) self.data = da.from_array(self.values, chunks=(2, 2)) - self.eager_array = DataArray(self.values, dims=('x', 'y'), name='foo') - self.lazy_array = DataArray(self.data, dims=('x', 'y'), name='foo') + self.eager_array = DataArray(self.values, coords={'x': range(4)}, + dims=('x', 'y'), name='foo') + self.lazy_array = DataArray(self.data, coords={'x': range(4)}, + dims=('x', 'y'), name='foo') def test_rechunk(self): chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2}) @@ -255,8 +257,8 @@ def test_groupby_first(self): self.assertLazyAndAllClose(expected, actual) def test_reindex(self): - u = self.eager_array - v = self.lazy_array + u = self.eager_array.assign_coords(y=range(6)) + v = self.lazy_array.assign_coords(y=range(6)) for kwargs in [{'x': [2, 3, 4]}, {'x': [1, 100, 2, 101, 3]}, @@ -316,8 +318,7 @@ def test_stack(self): stacked = arr.stack(z=('x', 'y')) z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], names=['x', 'y']) - expected = DataArray(data.reshape(2, -1), {'w': [0, 1], 'z': z}, - dims=['w', 'z']) + expected = DataArray(data.reshape(2, -1), {'z': z}, dims=['w', 'z']) assert stacked.data.chunks == expected.data.chunks self.assertLazyAndIdentical(expected, stacked) @@ -329,6 +330,7 @@ def test_dot(self): def test_from_dask_variable(self): # Test array creation from Variable with dask backend. # This is used e.g. 
in broadcast() - a = DataArray(self.lazy_array.variable) + a = DataArray(self.lazy_array.variable, + coords={'x': range(4)}, name='foo') self.assertLazyAndIdentical(self.lazy_array, a) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index d5ee9851469..0ed22440d54 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -14,8 +14,8 @@ from xarray.core.pycompat import iteritems, OrderedDict from xarray.core.common import _full_like -from xarray.test import (TestCase, ReturnItem, source_ndarray, unittest, requires_dask, - requires_bottleneck) +from xarray.test import (TestCase, ReturnItem, source_ndarray, unittest, + requires_dask, requires_bottleneck) class TestDataArray(TestCase): @@ -33,15 +33,15 @@ def setUp(self): def test_repr(self): v = Variable(['time', 'x'], [[1, 2, 3], [4, 5, 6]], {'foo': 'bar'}) - data_array = DataArray(v, {'other': np.int64(0)}, name='my_variable') + coords = OrderedDict([('x', [0, 1, 2]), ('other', np.int64(0))]) + data_array = DataArray(v, coords, name='my_variable') expected = dedent("""\ array([[1, 2, 3], [4, 5, 6]]) Coordinates: - other int64 0 - * time (time) int64 0 1 * x (x) int64 0 1 2 + other int64 0 Attributes: foo: bar""") self.assertEqual(expected, repr(data_array)) @@ -83,6 +83,23 @@ def test_data_property(self): self.assertArrayEqual(2 * np.ones((3, 4)), actual.data) self.assertArrayEqual(actual.data, actual.values) + def test_indexes(self): + array = DataArray(np.zeros((2, 3)), + [('x', [0, 1]), ('y', ['a', 'b', 'c'])]) + expected = OrderedDict([('x', pd.Index([0, 1])), + ('y', pd.Index(['a', 'b', 'c']))]) + assert array.indexes.keys() == expected.keys() + for k in expected: + assert array.indexes[k].equals(expected[k]) + + def test_get_index(self): + array = DataArray(np.zeros((2, 3)), coords={'x': ['a', 'b']}, + dims=['x', 'y']) + assert array.get_index('x').equals(pd.Index(['a', 'b'])) + assert array.get_index('y').equals(pd.Index([0, 1, 2])) + with self.assertRaises(KeyError): + array.get_index('z') + def test_struct_array_dims(self): """ This test checks subraction of two DataArrays for the case @@ -291,6 +308,7 @@ def test_constructor_from_self_described(self): panel = pd.Panel({0: frame}) actual = DataArray(panel) expected = DataArray([data], expected.coords, ['dim_0', 'x', 'y']) + expected['dim_0'] = [0] self.assertDataArrayIdentical(expected, actual) expected = DataArray(data, @@ -480,6 +498,14 @@ def test_sel(self): self.assertDataArrayIdentical(da[1], da.sel(x=b)) self.assertDataArrayIdentical(da[[1]], da.sel(x=slice(b, b))) + def test_sel_no_index(self): + array = DataArray(np.arange(10), dims='x') + self.assertDataArrayIdentical(array[0], array.sel(x=0)) + self.assertDataArrayIdentical(array[:5], array.sel(x=slice(5))) + self.assertDataArrayIdentical(array[[0, -1]], array.sel(x=[0, -1])) + self.assertDataArrayIdentical( + array[array < 5], array.sel(x=(array < 5))) + def test_sel_method(self): data = DataArray(np.random.randn(3, 4), [('x', [0, 1, 2]), ('y', list('abcd'))]) @@ -493,25 +519,26 @@ def test_sel_method(self): actual = data.sel(x=[0.9, 1.9], method='backfill', tolerance=1) self.assertDataArrayIdentical(expected, actual) else: - with self.assertRaisesRegexp(NotImplementedError, 'tolerance'): + with self.assertRaisesRegexp(TypeError, 'tolerance'): data.sel(x=[0.9, 1.9], method='backfill', tolerance=1) def test_isel_points(self): shape = (10, 5, 6) np_array = np.random.random(shape) - da = DataArray(np_array, dims=['time', 'y', 'x']) + da = DataArray(np_array, dims=['time', 
'y', 'x'], + coords={'time': np.arange(0, 100, 10)}) y = [1, 3] x = [3, 0] expected = da.values[:, y, x] actual = da.isel_points(y=y, x=x, dim='test_coord') - assert 'test_coord' in actual.coords assert actual.coords['test_coord'].shape == (len(y), ) - assert all(x in actual for x in ['time', 'x', 'y', 'test_coord']) + assert list(actual.coords) == ['time'] assert actual.dims == ('test_coord', 'time') + actual = da.isel_points(y=y, x=x) - assert 'points' in actual.coords + assert 'points' in actual.dims # Note that because xarray always concatenates along the first # dimension, We must transpose the result to match the numpy style of # concatenation. @@ -619,7 +646,13 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, self.assertDataArrayIdentical(mdata.sel(x={'one': 'a', 'two': 1}), mdata.sel(one='a', two=1)) - def test_time_components(self): + def test_virtual_default_coords(self): + array = DataArray(np.zeros((5,)), dims='x') + expected = DataArray(range(5), dims='x', name='x') + self.assertDataArrayIdentical(expected, array['x']) + self.assertDataArrayIdentical(expected, array.coords['x']) + + def test_virtual_time_components(self): dates = pd.date_range('2000-01-01', periods=10) da = DataArray(np.arange(1, 11), [('time', dates)]) @@ -655,11 +688,10 @@ def test_coords(self): actual = repr(da.coords) self.assertEquals(expected, actual) - with self.assertRaisesRegexp(ValueError, 'cannot delete'): - del da['x'] - - with self.assertRaisesRegexp(ValueError, 'cannot delete'): - del da.coords['x'] + del da.coords['x'] + expected = DataArray(da.values, {'y': [0, 1, 2]}, dims=['x', 'y'], + name='foo') + self.assertDataArrayIdentical(da, expected) with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): self.mda['level_1'] = np.arange(4) @@ -689,14 +721,16 @@ def test_coord_coords(self): def test_reset_coords(self): data = DataArray(np.zeros((3, 4)), {'bar': ('x', ['a', 'b', 'c']), - 'baz': ('y', range(4))}, + 'baz': ('y', range(4)), + 'y': range(4)}, dims=['x', 'y'], name='foo') actual = data.reset_coords() expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 4))), 'bar': ('x', ['a', 'b', 'c']), - 'baz': ('y', range(4))}) + 'baz': ('y', range(4)), + 'y': range(4)}) self.assertDatasetIdentical(actual, expected) actual = data.reset_coords(['bar', 'baz']) @@ -705,14 +739,15 @@ def test_reset_coords(self): actual = data.reset_coords('bar') expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 4))), 'bar': ('x', ['a', 'b', 'c'])}, - {'baz': ('y', range(4))}) + {'baz': ('y', range(4)), 'y': range(4)}) self.assertDatasetIdentical(actual, expected) actual = data.reset_coords(['bar']) self.assertDatasetIdentical(actual, expected) actual = data.reset_coords(drop=True) - expected = DataArray(np.zeros((3, 4)), dims=['x', 'y'], name='foo') + expected = DataArray(np.zeros((3, 4)), coords={'y': range(4)}, + dims=['x', 'y'], name='foo') self.assertDataArrayIdentical(actual, expected) actual = data.copy() @@ -720,7 +755,8 @@ def test_reset_coords(self): self.assertDataArrayIdentical(actual, expected) actual = data.reset_coords('bar', drop=True) - expected = DataArray(np.zeros((3, 4)), {'baz': ('y', range(4))}, + expected = DataArray(np.zeros((3, 4)), + {'baz': ('y', range(4)), 'y': range(4)}, dims=['x', 'y'], name='foo') self.assertDataArrayIdentical(actual, expected) @@ -753,7 +789,9 @@ def test_coords_alignment(self): rhs = DataArray([2, 3, 4], [('x', [1, 2, 3])]) lhs.coords['rhs'] = rhs - expected = DataArray([1, 2, 3], coords={'rhs': ('x', [np.nan, 2, 3])}, + expected = DataArray([1, 2, 
3], + coords={'rhs': ('x', [np.nan, 2, 3]), + 'x': [0, 1, 2]}, dims='x') self.assertDataArrayIdentical(lhs, expected) @@ -765,9 +803,16 @@ def test_coords_replacement_alignment(self): expected = DataArray([0, 1, 2], coords=[('abc', [1, 2, 3])]) self.assertDataArrayIdentical(arr, expected) - def test_reindex(self): - foo = self.dv - bar = self.dv[:2, :2] + def test_coords_non_string(self): + arr = DataArray(0, coords={1: 2}) + actual = arr.coords[1] + expected = DataArray(2, coords={1: 2}, name=1) + self.assertDataArrayIdentical(actual, expected) + + def test_reindex_like(self): + foo = DataArray(np.random.randn(5, 6), + [('x', range(5)), ('y', range(6))]) + bar = foo[:2, :2] self.assertDataArrayIdentical(foo.reindex_like(bar), bar) expected = foo.copy() @@ -775,8 +820,18 @@ def test_reindex(self): expected[:2, :2] = bar self.assertDataArrayIdentical(bar.reindex_like(foo), expected) + def test_reindex_like_no_index(self): + foo = DataArray(np.random.randn(5, 6), dims=['x', 'y']) + self.assertDatasetIdentical(foo, foo.reindex_like(foo)) + + bar = foo[:4] + with self.assertRaisesRegexp( + ValueError, 'different size for unlabeled'): + foo.reindex_like(bar) + + def test_reindex_regressions(self): # regression test for #279 - expected = DataArray(np.random.randn(5), dims=["time"]) + expected = DataArray(np.random.randn(5), coords=[("time", range(5))]) time2 = DataArray(np.arange(5), dims="time2") actual = expected.reindex(time=time2) self.assertDataArrayIdentical(actual, expected) @@ -789,7 +844,7 @@ def test_reindex(self): self.assertEqual(x.dtype, re_dtype) def test_reindex_method(self): - x = DataArray([10, 20], dims='y') + x = DataArray([10, 20], dims='y', coords={'y': [0, 1]}) y = [-0.1, 0.5, 1.1] if pd.__version__ >= '0.17': actual = x.reindex(y=y, method='backfill', tolerance=0.2) @@ -814,9 +869,7 @@ def test_rename(self): def test_swap_dims(self): array = DataArray(np.random.randn(3), {'y': ('x', list('abc'))}, 'x') - expected = DataArray(array.values, - {'y': list('abc'), 'x': ('y', range(3))}, - dims='y') + expected = DataArray(array.values, {'y': list('abc')}, dims='y') actual = array.swap_dims({'x': 'y'}) self.assertDataArrayIdentical(expected, actual) @@ -972,14 +1025,12 @@ def test_math_with_coords(self): def test_index_math(self): orig = DataArray(range(3), dims='x', name='x') actual = orig + 1 - expected = DataArray(1 + np.arange(3), coords=[('x', range(3))], - name='x') + expected = DataArray(1 + np.arange(3), dims='x', name='x') self.assertDataArrayIdentical(expected, actual) # regression tests for #254 actual = orig[0] < orig - expected = DataArray([False, True, True], coords=[('x', range(3))], - name='x') + expected = DataArray([False, True, True], dims='x', name='x') self.assertDataArrayIdentical(expected, actual) actual = orig > orig[0] @@ -1029,7 +1080,7 @@ def test_dataset_math(self): def test_stack_unstack(self): orig = DataArray([[0, 1], [2, 3]], dims=['x', 'y'], attrs={'foo': 2}) - actual = orig.stack(z=['x', 'y']).unstack('z') + actual = orig.stack(z=['x', 'y']).unstack('z').drop(['x', 'y']) self.assertDataArrayIdentical(orig, actual) def test_stack_unstack_decreasing_coordinate(self): @@ -1075,7 +1126,8 @@ def test_drop_coordinates(self): renamed.drop('foo') def test_drop_index_labels(self): - arr = DataArray(np.random.randn(2, 3), dims=['x', 'y']) + arr = DataArray(np.random.randn(2, 3), coords={'y': [0, 1, 2]}, + dims=['x', 'y']) actual = arr.drop([0, 1], dim='y') expected = arr[:, 2:] self.assertDataArrayIdentical(expected, actual) @@ -1099,6 +1151,12 @@ def 
test_dropna(self): expected = arr[:, 1:] self.assertDataArrayIdentical(actual, expected) + def test_where(self): + arr = DataArray(np.arange(4), dims='x') + expected = arr.sel(x=slice(2)) + actual = arr.where(arr.x < 2, drop=True) + self.assertDataArrayIdentical(actual, expected) + def test_cumops(self): coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), @@ -1165,12 +1223,12 @@ def test_reduce_keep_attrs(self): self.assertEqual(vm.attrs, self.attrs) def test_fillna(self): - a = DataArray([np.nan, 1, np.nan, 3], dims='x') + a = DataArray([np.nan, 1, np.nan, 3], coords={'x': range(4)}, dims='x') actual = a.fillna(-1) - expected = DataArray([-1, 1, -1, 3], dims='x') + expected = DataArray([-1, 1, -1, 3], coords={'x': range(4)}, dims='x') self.assertDataArrayIdentical(expected, actual) - b = DataArray(range(4), dims='x') + b = DataArray(range(4), coords={'x': range(4)}, dims='x') actual = a.fillna(b) expected = b.copy() self.assertDataArrayIdentical(expected, actual) @@ -1192,7 +1250,8 @@ def test_fillna(self): fill_value = DataArray([0, 1], dims='y') actual = a.fillna(fill_value) - expected = DataArray([[0, 1], [1, 1], [0, 1], [3, 3]], dims=('x', 'y')) + expected = DataArray([[0, 1], [1, 1], [0, 1], [3, 3]], + coords={'x': range(4)}, dims=('x', 'y')) self.assertDataArrayIdentical(expected, actual) expected = b.copy() @@ -1217,8 +1276,10 @@ def make_groupby_example_array(self): def test_groupby_properties(self): grouped = self.make_groupby_example_array().groupby('abc') - expected_unique = Variable('abc', ['a', 'b', 'c']) - self.assertVariableEqual(expected_unique, grouped.unique_coord) + expected_groups = {'a': range(0, 9), 'c': [9], 'b': range(10, 20)} + self.assertItemsEqual(expected_groups.keys(), grouped.groups.keys()) + for key in expected_groups: + self.assertArrayEqual(expected_groups[key], grouped.groups[key]) self.assertEqual(3, len(grouped)) def test_groupby_apply_identity(self): @@ -1259,7 +1320,6 @@ def test_groupby_sum(self): {'foo': (['x', 'abc'], np.array([self.x[:, :9].sum(1), self.x[:, 10:].sum(1), self.x[:, 9:10].sum(1)]).T), - 'x': self.ds['x'], 'abc': Variable(['abc'], np.array(['a', 'b', 'c']))})['foo'] self.assertDataArrayAllClose(expected_sum_axis1, grouped.reduce(np.sum, 'y')) @@ -1357,18 +1417,20 @@ def test_groupby_math(self): array += grouped def test_groupby_math_not_aligned(self): - array = DataArray(range(4), {'b': ('x', [0, 0, 1, 1])}, dims='x') - other = DataArray([10], dims='b') + array = DataArray(range(4), {'b': ('x', [0, 0, 1, 1]), + 'x': [0, 1, 2, 3]}, + dims='x') + other = DataArray([10], coords={'b': [0]}, dims='b') actual = array.groupby('b') + other expected = DataArray([10, 11, np.nan, np.nan], array.coords) self.assertDataArrayIdentical(expected, actual) - other = DataArray([10], coords={'c': 123}, dims='b') + other = DataArray([10], coords={'c': 123, 'b': [0]}, dims='b') actual = array.groupby('b') + other expected.coords['c'] = (['x'], [123] * 2 + [np.nan] * 2) self.assertDataArrayIdentical(expected, actual) - other = Dataset({'a': ('b', [10])}) + other = Dataset({'a': ('b', [10])}, {'b': [0]}) actual = array.groupby('b') + other expected = Dataset({'a': ('x', [10, 11, np.nan, np.nan])}, array.coords) @@ -1407,26 +1469,26 @@ def test_groupby_first_and_last(self): self.assertDataArrayIdentical(expected, actual) def make_groupby_multidim_example_array(self): - return DataArray([[[0,1],[2,3]],[[5,10],[15,20]]], - coords={'lon': (['ny', 'nx'], [[30., 40.], [40., 50.]] ), - 'lat': (['ny', 'nx'], [[10., 
10.], [20., 20.]] ),}, - dims=['time', 'ny', 'nx']) + return DataArray([[[0, 1], [2, 3]], [[5, 10], [15, 20]]], + coords={'lon': (['ny', 'nx'], [[30, 40], [40, 50]]), + 'lat': (['ny', 'nx'], [[10, 10], [20, 20]])}, + dims=['time', 'ny', 'nx']) def test_groupby_multidim(self): array = self.make_groupby_multidim_example_array() for dim, expected_sum in [ - ('lon', DataArray([5, 28, 23], coords=[('lon', [30., 40., 50.])])), + ('lon', DataArray([5, 28, 23], + coords=[('lon', [30., 40., 50.])])), ('lat', DataArray([16, 40], coords=[('lat', [10., 20.])]))]: actual_sum = array.groupby(dim).sum() self.assertDataArrayIdentical(expected_sum, actual_sum) def test_groupby_multidim_apply(self): array = self.make_groupby_multidim_example_array() - actual = array.groupby('lon').apply( - lambda x : x - x.mean(), shortcut=False) + actual = array.groupby('lon').apply(lambda x: x - x.mean()) expected = DataArray([[[-2.5, -6.], [-5., -8.5]], - [[ 2.5, 3.], [ 8., 8.5]]], - coords=array.coords, dims=array.dims) + [[2.5, 3.], [8., 8.5]]], + coords=array.coords, dims=array.dims) self.assertDataArrayIdentical(expected, actual) def test_groupby_bins(self): @@ -1435,46 +1497,42 @@ def test_groupby_bins(self): array[0] = 99 # bins follow conventions for pandas.cut # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html - bins = [0,1.5,5] + bins = [0, 1.5, 5] bin_coords = ['(0, 1.5]', '(1.5, 5]'] - expected = DataArray([1,5], dims='dim_0_bins', - coords={'dim_0_bins': bin_coords}) + expected = DataArray([1, 5], dims='dim_0_bins', + coords={'dim_0_bins': bin_coords}) # the problem with this is that it overwrites the dimensions of array! - #actual = array.groupby('dim_0', bins=bins).sum() - actual = array.groupby_bins('dim_0', bins).apply( - lambda x : x.sum(), shortcut=False) + # actual = array.groupby('dim_0', bins=bins).sum() + actual = array.groupby_bins('dim_0', bins).apply(lambda x: x.sum()) self.assertDataArrayIdentical(expected, actual) # make sure original array dims are unchanged - # (would fail with shortcut=True above) self.assertEqual(len(array.dim_0), 4) def test_groupby_bins_empty(self): - array = DataArray(np.arange(4), dims='dim_0') + array = DataArray(np.arange(4), [('x', range(4))]) # one of these bins will be empty - bins = [0,4,5] - actual = array.groupby_bins('dim_0', bins).sum() - expected = DataArray([6, np.nan], dims='dim_0_bins', - coords={'dim_0_bins': ['(0, 4]','(4, 5]']}) + bins = [0, 4, 5] + actual = array.groupby_bins('x', bins).sum() + expected = DataArray([6, np.nan], dims='x_bins', + coords={'x_bins': ['(0, 4]', '(4, 5]']}) self.assertDataArrayIdentical(expected, actual) # make sure original array is unchanged # (was a problem in earlier versions) - self.assertEqual(len(array.dim_0), 4) + self.assertEqual(len(array.x), 4) def test_groupby_bins_multidim(self): array = self.make_groupby_multidim_example_array() bins = [0,15,20] bin_coords = ['(0, 15]', '(15, 20]'] expected = DataArray([16, 40], dims='lat_bins', - coords={'lat_bins': bin_coords}) - actual = array.groupby_bins('lat', bins).apply( - lambda x : x.sum(), shortcut=False) + coords={'lat_bins': bin_coords}) + actual = array.groupby_bins('lat', bins).apply(lambda x: x.sum()) self.assertDataArrayIdentical(expected, actual) # modify the array coordinates to be non-monotonic after unstacking array['lat'].data = np.array([[10., 20.], [20., 10.]]) expected = DataArray([28, 28], dims='lat_bins', - coords={'lat_bins': bin_coords}) - actual = array.groupby_bins('lat', bins).apply( - lambda x : x.sum(), shortcut=False) + 
coords={'lat_bins': bin_coords}) + actual = array.groupby_bins('lat', bins).apply(lambda x: x.sum()) self.assertDataArrayIdentical(expected, actual) def test_groupby_bins_sort(self): @@ -1693,10 +1751,11 @@ def test_resample_upsampling(self): self.assertDataArrayIdentical(expected, actual) def test_align(self): - self.ds['x'] = ('x', np.array(list('abcdefghij'))) - dv1, dv2 = align(self.dv, self.dv[:5], join='inner') - self.assertDataArrayIdentical(dv1, self.dv[:5]) - self.assertDataArrayIdentical(dv2, self.dv[:5]) + array = DataArray(np.random.random((6, 8)), + coords={'x': list('abcdef')}, dims=['x', 'y']) + array1, array2 = align(array, array[:5], join='inner') + self.assertDataArrayIdentical(array1, array[:5]) + self.assertDataArrayIdentical(array2, array[:5]) def test_align_dtype(self): # regression test for #264 @@ -1767,6 +1826,34 @@ def test_align_indexes(self): coords=[('a', [-2, 7, 10, -1])]) self.assertDataArrayIdentical(expected_x2, x2) + def test_align_without_indexes_exclude(self): + arrays = [DataArray([1, 2, 3], dims=['x']), + DataArray([1, 2], dims=['x'])] + result0, result1 = align(*arrays, exclude=['x']) + self.assertDatasetIdentical(result0, arrays[0]) + self.assertDatasetIdentical(result1, arrays[1]) + + def test_align_mixed_indexes(self): + array_no_coord = DataArray([1, 2], dims=['x']) + array_with_coord = DataArray([1, 2], coords=[('x', ['a', 'b'])]) + result0, result1 = align(array_no_coord, array_with_coord) + self.assertDatasetIdentical(result0, array_with_coord) + self.assertDatasetIdentical(result1, array_with_coord) + + result0, result1 = align(array_no_coord, array_with_coord, + exclude=['x']) + self.assertDatasetIdentical(result0, array_no_coord) + self.assertDatasetIdentical(result1, array_with_coord) + + def test_align_without_indexes_errors(self): + with self.assertRaisesRegexp(ValueError, 'cannot be aligned'): + align(DataArray([1, 2, 3], dims=['x']), + DataArray([1, 2], dims=['x'])) + + with self.assertRaisesRegexp(ValueError, 'cannot be aligned'): + align(DataArray([1, 2, 3], dims=['x']), + DataArray([1, 2], coords=[('x', [0, 1])])) + def test_broadcast_arrays(self): x = DataArray([1, 2], coords=[('a', [-1, -2])], name='x') y = DataArray([1, 2], coords=[('b', [3, 4])], name='y') @@ -1871,7 +1958,7 @@ def test_to_pandas(self): for shape in [(3,), (3, 4), (3, 4, 5)]: dims = list('abc')[:len(shape)] da = DataArray(np.random.randn(*shape), dims=dims) - roundtripped = DataArray(da.to_pandas()) + roundtripped = DataArray(da.to_pandas()).drop(dims) self.assertDataArrayIdentical(da, roundtripped) with self.assertRaisesRegexp(ValueError, 'cannot convert'): @@ -1921,12 +2008,15 @@ def test_to_and_from_series(self): self.assertArrayEqual(expected.index.values, actual.index.values) self.assertEqual('foo', actual.name) # test roundtrip - self.assertDataArrayIdentical(self.dv, DataArray.from_series(actual)) + self.assertDataArrayIdentical( + self.dv, + DataArray.from_series(actual).drop(['x', 'y'])) # test name is None actual.name = None expected_da = self.dv.rename(None) - self.assertDataArrayIdentical(expected_da, - DataArray.from_series(actual)) + self.assertDataArrayIdentical( + expected_da, + DataArray.from_series(actual).drop(['x', 'y'])) def test_series_categorical_index(self): # regression test for GH700 @@ -1938,47 +2028,47 @@ def test_series_categorical_index(self): assert "'a'" in repr(arr) # should not error def test_to_and_from_dict(self): + array = DataArray(np.random.randn(2, 3), {'x': ['a', 'b']}, ['x', 'y'], + name='foo') expected = {'name': 'foo', 
'dims': ('x', 'y'), - 'data': self.x.tolist(), + 'data': array.values.tolist(), 'attrs': {}, - 'coords': {'y': {'dims': ('y',), - 'data': list(range(20)), - 'attrs': {}}, - 'x': {'dims': ('x',), - 'data': list(range(10)), + 'coords': {'x': {'dims': ('x',), + 'data': ['a', 'b'], 'attrs': {}}}} - actual = self.dv.to_dict() + actual = array.to_dict() # check that they are identical self.assertEqual(expected, actual) # check roundtrip - self.assertDataArrayIdentical(self.dv, DataArray.from_dict(actual)) + self.assertDataArrayIdentical(array, DataArray.from_dict(actual)) # a more bare bones representation still roundtrips d = {'name': 'foo', 'dims': ('x', 'y'), - 'data': self.x, - 'coords': {'y': {'dims': 'y', 'data': list(range(20))}, - 'x': {'dims': 'x', 'data': list(range(10))}}} - self.assertDataArrayIdentical(self.dv, DataArray.from_dict(d)) + 'data': array.values.tolist(), + 'coords': {'x': {'dims': 'x', 'data': ['a', 'b']}}} + self.assertDataArrayIdentical(array, DataArray.from_dict(d)) # and the most bare bones representation still roundtrips - d = {'name': 'foo', 'dims': ('x', 'y'), 'data': self.x} - self.assertDataArrayIdentical(self.dv, DataArray.from_dict(d)) + d = {'name': 'foo', 'dims': ('x', 'y'), 'data': array.values} + self.assertDataArrayIdentical(array.drop('x'), DataArray.from_dict(d)) # missing a dims in the coords d = {'dims': ('x', 'y'), - 'data': self.x, - 'coords': {'y': {'data': list(range(20))}, - 'x': {'dims': 'x', 'data': list(range(10))}}} - with self.assertRaisesRegexp(ValueError, "cannot convert dict when coords are missing the key 'dims'"): + 'data': array.values, + 'coords': {'x': {'data': ['a', 'b']}}} + with self.assertRaisesRegexp( + ValueError, + "cannot convert dict when coords are missing the key 'dims'"): DataArray.from_dict(d) # this one is missing some necessary information d = {'dims': ('t')} - with self.assertRaisesRegexp(ValueError, "cannot convert dict without the key 'data'"): + with self.assertRaisesRegexp( + ValueError, "cannot convert dict without the key 'data'"): DataArray.from_dict(d) def test_to_and_from_dict_with_time_dim(self): @@ -2140,14 +2230,15 @@ def test_to_dataset_retains_keys(self): self.assertDatasetEqual(array, result) def test__title_for_slice(self): - array = DataArray(np.ones((4, 3, 2)), dims=['a', 'b', 'c']) + array = DataArray(np.ones((4, 3, 2)), dims=['a', 'b', 'c'], + coords={'a': range(4), 'b': range(3), 'c': range(2)}) self.assertEqual('', array._title_for_slice()) self.assertEqual('c = 0', array.isel(c=0)._title_for_slice()) title = array.isel(b=1, c=0)._title_for_slice() self.assertTrue('b = 1, c = 0' == title or 'c = 0, b = 1' == title) a2 = DataArray(np.ones((4, 1)), dims=['a', 'b']) - self.assertEqual('b = [0]', a2._title_for_slice()) + self.assertEqual('', a2._title_for_slice()) def test__title_for_slice_truncate(self): array = DataArray(np.ones((4))) @@ -2161,11 +2252,9 @@ def test__title_for_slice_truncate(self): self.assertTrue(title.endswith('...')) def test_dataarray_diff_n1(self): - da = self.ds['foo'] + da = DataArray(np.random.randn(3, 4), dims=['x', 'y']) actual = da.diff('y') - expected = DataArray(np.diff(da.values, axis=1), - [da['x'].values, da['y'].values[1:]], - ['x', 'y']) + expected = DataArray(np.diff(da.values, axis=1), dims=['x', 'y']) self.assertDataArrayEqual(expected, actual) def test_coordinate_diff(self): @@ -2175,6 +2264,7 @@ def test_coordinate_diff(self): expected = DataArray([1] * 9, dims=['lon'], coords=[range(1, 10)], name='lon') actual = lon.diff('lon') + 
self.assertDataArrayEqual(expected, actual) def test_shift(self): arr = DataArray([1, 2, 3], dims='x') @@ -2182,13 +2272,14 @@ def test_shift(self): expected = DataArray([np.nan, 1, 2], dims='x') self.assertDataArrayIdentical(expected, actual) + arr = DataArray([1, 2, 3], [('x', ['a', 'b', 'c'])]) for offset in [-5, -2, -1, 0, 1, 2, 5]: expected = DataArray(arr.to_pandas().shift(offset)) actual = arr.shift(x=offset) self.assertDataArrayIdentical(expected, actual) def test_roll(self): - arr = DataArray([1, 2, 3], dims='x') + arr = DataArray([1, 2, 3], coords={'x': range(3)}, dims='x') actual = arr.roll(x=1) expected = DataArray([3, 1, 2], coords=[('x', [2, 0, 1])]) self.assertDataArrayIdentical(expected, actual) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 5b8af6b6437..cb985123b4b 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -37,7 +37,6 @@ def create_test_data(seed=None): obj = Dataset() obj['time'] = ('time', pd.date_range('2000-01-01', periods=20)) - obj['dim1'] = ('dim1', np.arange(_dims['dim1'], dtype='int64')) obj['dim2'] = ('dim2', 0.5 * np.arange(_dims['dim2'])) obj['dim3'] = ('dim3', list('abcdefghij')) for v, dims in sorted(_vars.items()): @@ -74,7 +73,6 @@ def test_repr(self): Dimensions: (dim1: 8, dim2: 9, dim3: 10, time: 20) Coordinates: * time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ... - * dim1 (dim1) int64 0 1 2 3 4 5 6 7 * dim2 (dim2) float64 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) %s 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' numbers (dim3) int64 0 1 2 0 0 1 1 2 2 3 @@ -233,7 +231,8 @@ def test_constructor_auto_align(self): # verify align uses outer join expected = Dataset({'a': ('x', [1, 2, np.nan]), - 'b': ('x', [np.nan, 3, 4])}) + 'b': ('x', [np.nan, 3, 4])}, + {'x': [0, 1, 2]}) actual = Dataset({'a': a, 'b': b}) self.assertDatasetIdentical(expected, actual) @@ -263,12 +262,14 @@ def test_constructor_pandas_sequence(self): (var_name, ds[var_name].to_pandas()) for var_name in ['foo','bar'] ) ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) + del ds_based_on_pandas['x'] self.assertDatasetEqual(ds, ds_based_on_pandas) # reindex pandas obj, check align works rearranged_index = reversed(pandas_objs['foo'].index) pandas_objs['foo'] = pandas_objs['foo'].reindex(rearranged_index) ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) + del ds_based_on_pandas['x'] self.assertDatasetEqual(ds, ds_based_on_pandas) def test_constructor_pandas_single(self): @@ -303,14 +304,14 @@ def test_constructor_compat(self): self.assertDatasetIdentical(expected, actual) original = Dataset({'a': (('x', 'y'), np.ones((2, 3)))}, - {'c': (('x', 'y'), np.zeros((2, 3)))}) + {'c': (('x', 'y'), np.zeros((2, 3))), 'x': [0, 1]}) expected = Dataset({'a': ('x', np.ones(2)), 'b': ('y', np.ones(3))}, - {'c': (('x', 'y'), np.zeros((2, 3)))}) + {'c': (('x', 'y'), np.zeros((2, 3))), 'x': [0, 1]}) # use an OrderedDict to ensure test results are reproducible; otherwise # the order of appearance of x and y matters for the order of # dimensions in 'c' - actual = Dataset(OrderedDict([('a', original['a'][:, 0].drop('y')), + actual = Dataset(OrderedDict([('a', original['a'][:, 0]), ('b', original['a'][0].drop('x'))])) self.assertDatasetIdentical(expected, actual) @@ -329,7 +330,7 @@ def test_constructor_with_coords(self): ds = Dataset({}, {'a': ('x', [1])}) self.assertFalse(ds.data_vars) - self.assertItemsEqual(ds.coords.keys(), ['x', 'a']) + self.assertItemsEqual(ds.coords.keys(), ['a']) mindex = 
pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=('level_1', 'level_2')) @@ -354,7 +355,7 @@ def test_properties(self): self.assertItemsEqual(ds.keys(), list(ds.variables)) self.assertNotIn('aasldfjalskdfj', ds.variables) self.assertIn('dim1', repr(ds.variables)) - self.assertEqual(len(ds), 8) + self.assertEqual(len(ds), 7) self.assertItemsEqual(ds.data_vars, ['var1', 'var2', 'var3']) self.assertItemsEqual(ds.data_vars.keys(), ['var1', 'var2', 'var3']) @@ -363,20 +364,28 @@ def test_properties(self): self.assertNotIn('numbers', ds.data_vars) self.assertEqual(len(ds.data_vars), 3) - self.assertItemsEqual(ds.indexes, ['dim1', 'dim2', 'dim3', 'time']) - self.assertEqual(len(ds.indexes), 4) - self.assertIn('dim1', repr(ds.indexes)) + self.assertItemsEqual(ds.indexes, ['dim2', 'dim3', 'time']) + self.assertEqual(len(ds.indexes), 3) + self.assertIn('dim2', repr(ds.indexes)) - self.assertItemsEqual(ds.coords, - ['time', 'dim1', 'dim2', 'dim3', 'numbers']) - self.assertIn('dim1', ds.coords) + self.assertItemsEqual(ds.coords, ['time', 'dim2', 'dim3', 'numbers']) + self.assertIn('dim2', ds.coords) self.assertIn('numbers', ds.coords) self.assertNotIn('var1', ds.coords) - self.assertEqual(len(ds.coords), 5) + self.assertNotIn('dim1', ds.coords) + self.assertEqual(len(ds.coords), 4) self.assertEqual(Dataset({'x': np.int64(1), 'y': np.float32([1, 2])}).nbytes, 16) + def test_get_index(self): + ds = Dataset({'foo': (('x', 'y'), np.zeros((2, 3)))}, + coords={'x': ['a', 'b']}) + assert ds.get_index('x').equals(pd.Index(['a', 'b'])) + assert ds.get_index('y').equals(pd.Index([0, 1, 2])) + with self.assertRaises(KeyError): + ds.get_index('z') + def test_attr_access(self): ds = Dataset({'tmin': ('x', [42], {'units': 'Celcius'})}, attrs={'title': 'My test data'}) @@ -402,9 +411,8 @@ def test_variable(self): self.assertTrue('foo' in a) a['bar'] = (('time', 'x',), d) # order of creation is preserved - self.assertEqual(list(a), ['foo', 'time', 'x', 'bar']) - self.assertTrue(all([a['foo'][i].values == d[i] - for i in np.ndindex(*d.shape)])) + self.assertEqual(list(a), ['foo', 'bar']) + self.assertArrayEqual(a['foo'].values, d) # try to add variable with dim (10,3) with data that's (3,10) with self.assertRaises(ValueError): a['qux'] = (('time', 'x'), d.T) @@ -415,8 +423,7 @@ def test_modify_inplace(self): attributes = {'foo': 'bar'} a['x'] = ('x', vec, attributes) self.assertTrue('x' in a.coords) - self.assertIsInstance(a.coords['x'].to_index(), - pd.Index) + self.assertIsInstance(a.coords['x'].to_index(), pd.Index) self.assertVariableIdentical(a.coords['x'], a.variables['x']) b = Dataset() b['x'] = ('x', vec, attributes) @@ -571,10 +578,12 @@ def test_coords_to_dataset(self): self.assertDatasetIdentical(expected, actual) def test_coords_merge(self): - orig_coords = Dataset(coords={'a': ('x', [1, 2])}).coords - other_coords = Dataset(coords={'b': ('x', ['a', 'b'])}).coords + orig_coords = Dataset(coords={'a': ('x', [1, 2]), 'x': [0, 1]}).coords + other_coords = Dataset(coords={'b': ('x', ['a', 'b']), + 'x': [0, 1]}).coords expected = Dataset(coords={'a': ('x', [1, 2]), - 'b': ('x', ['a', 'b'])}) + 'b': ('x', ['a', 'b']), + 'x': [0, 1]}) actual = orig_coords.merge(other_coords) self.assertDatasetIdentical(expected, actual) actual = other_coords.merge(orig_coords) @@ -749,20 +758,19 @@ def test_isel(self): self.assertEqual({'time': 20, 'dim2': 9, 'dim3': 10}, ret.dims) self.assertItemsEqual(data.data_vars, ret.data_vars) self.assertItemsEqual(data.coords, ret.coords) - self.assertItemsEqual(data.indexes, 
list(ret.indexes) + ['dim1'])
+        self.assertItemsEqual(data.indexes, ret.indexes)
 
         ret = data.isel(time=slice(2), dim1=0, dim2=slice(5))
         self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dims)
         self.assertItemsEqual(data.data_vars, ret.data_vars)
         self.assertItemsEqual(data.coords, ret.coords)
-        self.assertItemsEqual(data.indexes, list(ret.indexes) + ['dim1'])
+        self.assertItemsEqual(data.indexes, ret.indexes)
 
         ret = data.isel(time=0, dim1=0, dim2=slice(5))
         self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dims)
         self.assertItemsEqual(data.data_vars, ret.data_vars)
         self.assertItemsEqual(data.coords, ret.coords)
-        self.assertItemsEqual(data.indexes,
-                              list(ret.indexes) + ['dim1', 'time'])
+        self.assertItemsEqual(data.indexes, list(ret.indexes) + ['time'])
 
     def test_sel(self):
         data = create_test_data()
@@ -805,12 +813,12 @@ def test_isel_points(self):
 
         actual = data.isel_points(dim1=pdim1, dim2=pdim2, dim3=pdim3,
                                   dim='test_coord')
-        assert 'test_coord' in actual.coords
+        assert 'test_coord' in actual.dims
        assert actual.coords['test_coord'].shape == (len(pdim1), )
 
         actual = data.isel_points(dim1=pdim1, dim2=pdim2)
-        assert 'points' in actual.coords
-        np.testing.assert_array_equal(pdim1, actual['dim1'])
+        assert 'points' in actual.dims
+        np.testing.assert_array_equal(data['dim2'][pdim2], actual['dim2'])
 
         # test that the order of the indexers doesn't matter
         self.assertDatasetIdentical(data.isel_points(dim1=pdim1, dim2=pdim2),
@@ -850,7 +858,7 @@ def test_isel_points(self):
                                     dim=stations['station'])
         assert 'station' in actual.coords
         assert 'station' in actual.dims
-        self.assertDataArrayIdentical(actual['station'].drop(['dim1', 'dim2']),
+        self.assertDataArrayIdentical(actual['station'].drop(['dim2']),
                                       stations['station'])
 
         # make sure we get the default 'points' coordinate when a list is passed
@@ -874,6 +882,9 @@ def test_isel_points(self):
 
     def test_sel_points(self):
         data = create_test_data()
+        # add in a range() index
+        data['dim1'] = data.dim1
+
         pdim1 = [1, 2, 3]
         pdim2 = [4, 5, 1]
         pdim3 = [1, 2, 3]
@@ -884,9 +895,13 @@ def test_sel_points(self):
         self.assertDatasetIdentical(expected, actual)
 
         data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))})
-        expected = Dataset({'foo': ('points', [0, 4, 8])},
-                           {'x': ('points', range(3)),
-                            'y': ('points', range(3))})
+        expected = Dataset({'foo': ('points', [0, 4, 8])})
+        actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2])
+        self.assertDatasetIdentical(expected, actual)
+
+        data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]})
+        expected.coords.update({'x': ('points', [0, 1, 2]),
+                                'y': ('points', [0, 1, 2])})
         actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0],
                                  method='pad')
         self.assertDatasetIdentical(expected, actual)
@@ -899,16 +914,16 @@ def test_sel_method(self):
         data = create_test_data()
 
         if pd.__version__ >= '0.16':
-            expected = data.sel(dim1=1)
-            actual = data.sel(dim1=0.95, method='nearest')
+            expected = data.sel(dim2=1)
+            actual = data.sel(dim2=0.95, method='nearest')
             self.assertDatasetIdentical(expected, actual)
 
         if pd.__version__ >= '0.17':
-            actual = data.sel(dim1=0.95, method='nearest', tolerance=1)
+            actual = data.sel(dim2=0.95, method='nearest', tolerance=1)
             self.assertDatasetIdentical(expected, actual)
 
         with self.assertRaises(KeyError):
-            actual = data.sel(dim1=0.5, method='nearest', tolerance=0)
+            actual = data.sel(dim2=np.pi, method='nearest', tolerance=0)
 
         expected = data.sel(dim2=[1.5])
         actual = data.sel(dim2=[1.45], method='backfill')
@@ -921,6 +936,10 @@ def test_sel_method(self):
            # this should not pass 
silently data.sel(data) + # cannot pass method if there is no associated coordinate + with self.assertRaisesRegexp(ValueError, 'cannot supply'): + data.sel(dim1=0, method='nearest') + def test_loc(self): data = create_test_data() expected = data.sel(dim3='a') @@ -999,18 +1019,27 @@ def test_reindex(self): data = create_test_data() self.assertDatasetIdentical(data, data.reindex()) - expected = data.isel(dim1=slice(10)) - actual = data.reindex(dim1=data['dim1'][:10]) + expected = data.assign_coords(dim1=data['dim1']) + actual = data.reindex(dim1=data['dim1']) self.assertDatasetIdentical(actual, expected) - actual = data.reindex(dim1=data['dim1'][:10].values) + actual = data.reindex(dim1=data['dim1'].values) self.assertDatasetIdentical(actual, expected) - actual = data.reindex(dim1=data['dim1'][:10].to_index()) + actual = data.reindex(dim1=data['dim1'].to_index()) + self.assertDatasetIdentical(actual, expected) + + with self.assertRaisesRegexp( + ValueError, 'cannot reindex or align along dimension'): + data.reindex(dim1=data['dim1'][:5]) + + expected = data.isel(dim2=slice(5)) + actual = data.reindex(dim2=data['dim2'][:5]) self.assertDatasetIdentical(actual, expected) # test dict-like argument - actual = data.reindex({'dim1': data['dim1'][:10]}) + actual = data.reindex({'dim2': data['dim2']}) + expected = data self.assertDatasetIdentical(actual, expected) with self.assertRaisesRegexp(ValueError, 'cannot specify both'): data.reindex({'x': 0}, x=0) @@ -1022,20 +1051,22 @@ def test_reindex(self): data.reindex(invalid=0) # out of order - expected = data.sel(dim1=data['dim1'][:10:-1]) - actual = data.reindex(dim1=data['dim1'][:10:-1]) + expected = data.sel(dim2=data['dim2'][:5:-1]) + actual = data.reindex(dim2=data['dim2'][:5:-1]) self.assertDatasetIdentical(actual, expected) # regression test for #279 - expected = Dataset({'x': ('time', np.random.randn(5))}) + expected = Dataset({'x': ('time', np.random.randn(5))}, + {'time': range(5)}) time2 = DataArray(np.arange(5), dims="time2") actual = expected.reindex(time=time2) self.assertDatasetIdentical(actual, expected) # another regression test - ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}) - expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2))), - 'x': [0, 1, 3]}) + ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}, + {'x': range(3), 'y': range(4)}) + expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2)))}, + {'x': [0, 1, 3], 'y': [0, 1]}) expected['foo'][-1] = np.nan actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) self.assertDatasetIdentical(expected, actual) @@ -1047,7 +1078,7 @@ def test_reindex_variables_copied(self): assert reindexed_data.variables[k] is not data.variables[k] def test_reindex_method(self): - ds = Dataset({'x': ('y', [10, 20])}) + ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) y = [-0.5, 0.5, 1.5] actual = ds.reindex(y=y, method='backfill') expected = Dataset({'x': ('y', [10, 20, np.nan]), 'y': y}) @@ -1058,7 +1089,7 @@ def test_reindex_method(self): expected = Dataset({'x': ('y', 3 * [np.nan]), 'y': y}) self.assertDatasetIdentical(expected, actual) else: - with self.assertRaisesRegexp(NotImplementedError, 'tolerance'): + with self.assertRaisesRegexp(TypeError, 'tolerance'): ds.reindex(y=y, method='backfill', tolerance=0.1) actual = ds.reindex(y=y, method='pad') @@ -1259,11 +1290,6 @@ def test_drop_variables(self): actual = data.drop(['time']) self.assertDatasetIdentical(expected, actual) - expected = Dataset(dict((k, data[k]) for - k in ['dim2', 'dim3', 'time', 'numbers'])) - actual = data.drop('dim1') - 
self.assertDatasetIdentical(expected, actual) - with self.assertRaisesRegexp(ValueError, 'cannot be found'): data.drop('not_found_here') @@ -1271,10 +1297,6 @@ def test_drop_index_labels(self): data = Dataset({'A': (['x', 'y'], np.random.randn(2, 3)), 'x': ['a', 'b']}) - actual = data.drop(1, 'y') - expected = data.isel(y=[0, 2]) - self.assertDatasetIdentical(expected, actual) - actual = data.drop(['a'], 'x') expected = data.isel(x=[1]) self.assertDatasetIdentical(expected, actual) @@ -1287,6 +1309,10 @@ def test_drop_index_labels(self): # not contained in axis data.drop(['c'], dim='x') + with self.assertRaisesRegexp( + ValueError, 'does not have coordinate labels'): + data.drop(1, 'y') + def test_copy(self): data = create_test_data() @@ -1406,6 +1432,7 @@ def test_unstack(self): names=['x', 'y']) ds = Dataset({'b': ('z', [0, 1, 2, 3]), 'z': index}) expected = Dataset({'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'x': [0, 1], 'y': ['a', 'b']}) actual = ds.unstack('z') self.assertDatasetIdentical(actual, expected) @@ -1420,6 +1447,7 @@ def test_unstack_errors(self): def test_stack_unstack(self): ds = Dataset({'a': ('x', [0, 1]), 'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'x': [0, 1], 'y': ['a', 'b']}) actual = ds.stack(z=['x', 'y']).unstack('z') assert actual.broadcast_equals(ds) @@ -1451,9 +1479,10 @@ def test_update(self): self.assertDatasetIdentical(expected, actual) def test_update_auto_align(self): - ds = Dataset({'x': ('t', [3, 4])}) + ds = Dataset({'x': ('t', [3, 4])}, {'t': [0, 1]}) - expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan, 5])}) + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan, 5])}, + {'t': [0, 1]}) actual = ds.copy() other = {'y': ('t', [5]), 't': [1]} with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): @@ -1464,7 +1493,8 @@ def test_update_auto_align(self): actual = ds.copy() other = Dataset({'y': ('t', [5]), 't': [100]}) actual.update(other) - expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan] * 2)}) + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan] * 2)}, + {'t': [0, 1]}) self.assertDatasetIdentical(expected, actual) def test_getitem(self): @@ -1500,7 +1530,18 @@ def test_getitem_hashable(self): with self.assertRaisesRegexp(KeyError, "('var1', 'var2')"): data[('var1', 'var2')] - def test_virtual_variables(self): + def test_virtual_variables_default_coords(self): + dataset = Dataset({'foo': ('x', range(10))}) + expected = DataArray(range(10), dims='x', name='x') + actual = dataset['x'] + self.assertDataArrayIdentical(expected, actual) + self.assertIsInstance(actual.variable, IndexVariable) + + actual = dataset[['x', 'foo']] + expected = dataset.assign_coords(x=range(10)) + self.assertDatasetIdentical(expected, actual) + + def test_virtual_variables_time(self): # access virtual variables data = create_test_data() expected = DataArray(1 + np.arange(20), coords=[data['time']], @@ -1588,7 +1629,7 @@ def test_setitem(self): with self.assertRaisesRegexp(ValueError, 'already exists as a scalar'): data1['newvar'] = ('scalar', [3, 4, 5]) # can't resize a used dimension - with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): + with self.assertRaisesRegexp(ValueError, 'arguments without labels'): data1['dim1'] = data1['dim1'][:5] # override an existing value data1['A'] = 3 * data2['A'] @@ -1600,6 +1641,7 @@ def test_setitem(self): def test_setitem_pandas(self): ds = self.make_example_math_dataset() + ds['x'] = np.arange(3) ds_copy = ds.copy() ds_copy['bar'] = ds['bar'].to_pandas() @@ -1613,26 +1655,27 @@ def 
test_setitem_auto_align(self): self.assertDatasetIdentical(ds, expected) ds['y'] = DataArray(range(3), dims='y') - expected = Dataset({'x': ('y', range(3))}) + expected = Dataset({'x': ('y', range(3))}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) - ds['x'] = DataArray([1, 2], dims='y') - expected = Dataset({'x': ('y', [1, 2, np.nan])}) + ds['x'] = DataArray([1, 2], coords=[('y', [0, 1])]) + expected = Dataset({'x': ('y', [1, 2, np.nan])}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) ds['x'] = 42 expected = Dataset({'x': 42, 'y': range(3)}) self.assertDatasetIdentical(ds, expected) - ds['x'] = DataArray([4, 5, 6, 7], dims='y') - expected = Dataset({'x': ('y', [4, 5, 6])}) + ds['x'] = DataArray([4, 5, 6, 7], coords=[('y', [0, 1, 2, 3])]) + expected = Dataset({'x': ('y', [4, 5, 6])}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) def test_setitem_align_new_indexes(self): ds = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) ds['bar'] = DataArray([2, 3, 4], [('x', [1, 2, 3])]) expected = Dataset({'foo': ('x', [1, 2, 3]), - 'bar': ('x', [np.nan, 2, 3])}) + 'bar': ('x', [np.nan, 2, 3])}, + {'x': [0, 1, 2]}) self.assertDatasetIdentical(ds, expected) def test_assign(self): @@ -1644,11 +1687,11 @@ def test_assign(self): self.assertDatasetIdentical(ds, Dataset()) actual = actual.assign(y = lambda ds: ds.x ** 2) - expected = Dataset({'y': ('x', [0, 1, 4])}) + expected = Dataset({'y': ('x', [0, 1, 4]), 'x': [0, 1, 2]}) self.assertDatasetIdentical(actual, expected) actual = actual.assign_coords(z = 2) - expected = Dataset({'y': ('x', [0, 1, 4])}, {'z': 2}) + expected = Dataset({'y': ('x', [0, 1, 4])}, {'z': 2, 'x': [0, 1, 2]}) self.assertDatasetIdentical(actual, expected) ds = Dataset({'a': ('x', range(3))}, {'b': ('x', ['A'] * 2 + ['B'])}) @@ -1673,8 +1716,8 @@ def test_assign_multiindex_level(self): def test_setitem_original_non_unique_index(self): # regression test for GH943 original = Dataset({'data': ('x', np.arange(5))}, - coords={'x': [0, 1, 2, 0, 1]}) - expected = Dataset({'data': ('x', np.arange(5))}) + coords={'x': [0, 1, 2, 0, 1]}) + expected = Dataset({'data': ('x', np.arange(5))}, {'x': range(5)}) actual = original.copy() actual['x'] = list(range(5)) @@ -1710,10 +1753,9 @@ def test_delitem(self): self.assertItemsEqual(data, all_items) del data['var1'] self.assertItemsEqual(data, all_items - set(['var1'])) - del data['dim1'] - self.assertItemsEqual(data, set(['time', 'dim2', 'dim3', 'numbers'])) - self.assertNotIn('dim1', data.dims) - self.assertNotIn('dim1', data.coords) + del data['numbers'] + self.assertItemsEqual(data, all_items - set(['var1', 'numbers'])) + self.assertNotIn('numbers', data.coords) def test_squeeze(self): data = Dataset({'foo': (['x', 'y', 'z'], [[[1], [2]]])}) @@ -1731,7 +1773,8 @@ def get_args(v): def test_groupby(self): data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, {'x': ('x', list('abc')), - 'c': ('x', [0, 1, 0])}) + 'c': ('x', [0, 1, 0]), + 'y': range(5)}) groupby = data.groupby('x') self.assertEqual(len(groupby), 3) expected_groups = {'a': 0, 'b': 1, 'c': 2} @@ -1769,11 +1812,11 @@ def test_groupby_iter(self): def test_groupby_errors(self): data = create_test_data() - with self.assertRaisesRegexp(ValueError, 'must have a name'): + with self.assertRaisesRegexp(TypeError, '`group` must be'): data.groupby(np.arange(10)) with self.assertRaisesRegexp(ValueError, 'length does not match'): data.groupby(data['dim1'][:3]) - with self.assertRaisesRegexp(ValueError, "must have a 'dims'"): + with 
self.assertRaisesRegexp(TypeError, "`group` must be"): data.groupby(data.coords['dim1'].to_index()) def test_groupby_reduce(self): @@ -1802,6 +1845,7 @@ def test_groupby_math(self): reorder_dims = lambda x: x.transpose('dim1', 'dim2', 'dim3', 'time') ds = create_test_data() + ds['dim1'] = ds['dim1'] for squeeze in [True, False]: grouped = ds.groupby('dim1', squeeze=squeeze) @@ -1939,7 +1983,7 @@ def test_to_array(self): ds = Dataset(OrderedDict([('a', 1), ('b', ('x', [1, 2, 3]))]), coords={'c': 42}, attrs={'Conventions': 'None'}) data = [[1, 1, 1], [1, 2, 3]] - coords = {'x': range(3), 'c': 42, 'variable': ['a', 'b']} + coords = {'c': 42, 'variable': ['a', 'b']} dims = ('variable', 'x') expected = DataArray(data, coords, dims, attrs=ds.attrs) actual = ds.to_array() @@ -1981,23 +2025,24 @@ def test_to_and_from_dataframe(self): self.assertTrue(expected.equals(actual)) # check roundtrip - self.assertDatasetIdentical(ds, Dataset.from_dataframe(actual)) + self.assertDatasetIdentical(ds.assign_coords(x=[0, 1]), + Dataset.from_dataframe(actual)) # check pathological cases df = pd.DataFrame([1]) actual = Dataset.from_dataframe(df) - expected = Dataset({0: ('index', [1])}) + expected = Dataset({0: ('index', [1])}, {'index': [0]}) self.assertDatasetIdentical(expected, actual) df = pd.DataFrame() actual = Dataset.from_dataframe(df) - expected = Dataset(coords={'index':[]}) + expected = Dataset(coords={'index': []}) self.assertDatasetIdentical(expected, actual) # GH697 df = pd.DataFrame({'A' : []}) actual = Dataset.from_dataframe(df) - expected = Dataset({'A': DataArray([], dims=('index',))}) + expected = Dataset({'A': DataArray([], dims=('index',))}, {'index': []}) self.assertDatasetIdentical(expected, actual) # regression test for GH278 @@ -2242,11 +2287,12 @@ def test_dropna(self): ds.dropna('a', how=None) def test_fillna(self): - ds = Dataset({'a': ('x', [np.nan, 1, np.nan, 3])}) + ds = Dataset({'a': ('x', [np.nan, 1, np.nan, 3])}, + {'x': [0, 1, 2, 3]}) # fill with -1 actual = ds.fillna(-1) - expected = Dataset({'a': ('x', [-1, 1, -1, 3])}) + expected = Dataset({'a': ('x', [-1, 1, -1, 3])}, {'x': [0, 1, 2, 3]}) self.assertDatasetIdentical(expected, actual) actual = ds.fillna({'a': -1}) @@ -2260,7 +2306,7 @@ def test_fillna(self): self.assertDatasetIdentical(expected, actual) # fill with range(4) - b = DataArray(range(4), dims='x') + b = DataArray(range(4), coords=[('x', range(4))]) actual = ds.fillna(b) expected = b.rename('a').to_dataset() self.assertDatasetIdentical(expected, actual) @@ -2277,7 +2323,8 @@ def test_fillna(self): # okay to only include some data variables ds['b'] = np.nan actual = ds.fillna({'a': -1}) - expected = Dataset({'a': ('x', [-1, 1, -1, 3]), 'b': np.nan}) + expected = Dataset({'a': ('x', [-1, 1, -1, 3]), 'b': np.nan}, + {'x': [0, 1, 2, 3]}) self.assertDatasetIdentical(expected, actual) # but new data variables is not okay @@ -2293,7 +2340,7 @@ def test_fillna(self): self.assertDatasetIdentical(expected, result) # groupby - expected = Dataset({'a': ('x', range(4))}) + expected = Dataset({'a': ('x', range(4))}, {'x': [0, 1, 2, 3]}) for target in [ds, expected]: target.coords['b'] = ('x', [0, 0, 1, 1]) actual = ds.groupby('b').fillna(DataArray([0, 2], dims='b')) @@ -2310,12 +2357,12 @@ def test_fillna(self): self.assertEqual(actual.a.name, 'a') self.assertEqual(actual.a.attrs, ds.a.attrs) - da = DataArray(range(5), name='a', attrs={'attr':'da'}) + da = DataArray(range(5), name='a', attrs={'attr': 'da'}) actual = da.fillna(1) self.assertEqual(actual.name, 'a') 
self.assertEqual(actual.attrs, da.attrs) - ds = Dataset({'a': da}, attrs={'attr':'ds'}) + ds = Dataset({'a': da}, attrs={'attr': 'ds'}) actual = ds.fillna({'a': 1}) self.assertEqual(actual.attrs, ds.attrs) self.assertEqual(actual.a.name, 'a') @@ -2710,14 +2757,13 @@ def test_dataset_dataset_math(self): def test_dataset_math_auto_align(self): ds = self.make_example_math_dataset() - subset = ds.isel(x=slice(2), y=[1, 3]) + subset = ds.isel(y=[1, 3]) expected = 2 * subset actual = ds + subset self.assertDatasetIdentical(expected, actual) - - actual = ds.isel(x=slice(1)) + ds.isel(x=slice(1, None)) - expected = ds.drop(ds.x, dim='x') + actual = ds.isel(y=slice(1)) + ds.isel(y=slice(1, None)) + expected = 2 * ds.drop(ds.y, dim='y') self.assertDatasetEqual(actual, expected) actual = ds + ds[['bar']] @@ -2733,12 +2779,11 @@ def test_dataset_math_auto_align(self): # maybe unary arithmetic with empty datasets should raise instead? self.assertDatasetIdentical(Dataset() + 1, Dataset()) - for other in [ds.isel(x=slice(2)), ds.bar.isel(x=slice(0))]: - actual = ds.copy(deep=True) - other = ds.isel(x=slice(2)) - actual += other - expected = ds + other.reindex_like(ds) - self.assertDatasetIdentical(expected, actual) + actual = ds.copy(deep=True) + other = ds.isel(y=slice(2)) + actual += other + expected = ds + other.reindex_like(ds) + self.assertDatasetIdentical(expected, actual) def test_dataset_math_errors(self): ds = self.make_example_math_dataset() @@ -2803,14 +2848,16 @@ def test_dataset_diff_n1_simple(self): ds = Dataset({'foo': ('x', [5, 5, 6, 6])}) actual = ds.diff('x') expected = Dataset({'foo': ('x', [0, 1, 0])}) - expected.coords['x'].values = [1, 2, 3] self.assertDatasetEqual(expected, actual) - def test_dataset_diff_n1_lower(self): - ds = Dataset({'foo': ('x', [5, 5, 6, 6])}) + def test_dataset_diff_n1_label(self): + ds = Dataset({'foo': ('x', [5, 5, 6, 6])}, {'x': [0, 1, 2, 3]}) actual = ds.diff('x', label='lower') - expected = Dataset({'foo': ('x', [0, 1, 0])}) - expected.coords['x'].values = [0, 1, 2] + expected = Dataset({'foo': ('x', [0, 1, 0])}, {'x': [0, 1, 2]}) + self.assertDatasetEqual(expected, actual) + + actual = ds.diff('x', label='upper') + expected = Dataset({'foo': ('x', [0, 1, 0])}, {'x': [1, 2, 3]}) self.assertDatasetEqual(expected, actual) def test_dataset_diff_n1(self): @@ -2818,12 +2865,10 @@ def test_dataset_diff_n1(self): actual = ds.diff('dim2') expected = dict() expected['var1'] = DataArray(np.diff(ds['var1'].values, axis=1), - [ds['dim1'].values, - ds['dim2'].values[1:]], + {'dim2': ds['dim2'].values[1:]}, ['dim1', 'dim2']) expected['var2'] = DataArray(np.diff(ds['var2'].values, axis=1), - [ds['dim1'].values, - ds['dim2'].values[1:]], + {'dim2': ds['dim2'].values[1:]}, ['dim1', 'dim2']) expected['var3'] = ds['var3'] expected = Dataset(expected, coords={'time': ds['time'].values}) @@ -2835,12 +2880,10 @@ def test_dataset_diff_n2(self): actual = ds.diff('dim2', n=2) expected = dict() expected['var1'] = DataArray(np.diff(ds['var1'].values, axis=1, n=2), - [ds['dim1'].values, - ds['dim2'].values[2:]], + {'dim2': ds['dim2'].values[2:]}, ['dim1', 'dim2']) expected['var2'] = DataArray(np.diff(ds['var2'].values, axis=1, n=2), - [ds['dim1'].values, - ds['dim2'].values[2:]], + {'dim2': ds['dim2'].values[2:]}, ['dim1', 'dim2']) expected['var3'] = ds['var3'] expected = Dataset(expected, coords={'time': ds['time'].values}) @@ -2979,7 +3022,7 @@ def data_set(seed=None): def test_dir_expected_attrs(data_set): some_expected_attrs = {'pipe', 'mean', 'isnull', 'var1', - 'dim1', 
'numbers'} + 'dim2', 'numbers'} result = dir(data_set) assert set(result) >= some_expected_attrs diff --git a/xarray/test/test_groupby.py b/xarray/test/test_groupby.py index 2ba2ed5f7c1..f4b96af74ef 100644 --- a/xarray/test/test_groupby.py +++ b/xarray/test/test_groupby.py @@ -46,4 +46,11 @@ def test_multi_index_groupby_sum(): assert expected.equals(actual) +def test_groupby_duplicate_coordinate_labels(): + # fix for http://stackoverflow.com/questions/38065129 + array = xr.DataArray([1, 2, 3], [('x', [1, 1, 2])]) + expected = xr.DataArray([3, 3], [('x', [1, 2])]) + actual = array.groupby('x').sum() + assert expected.equals(actual) + # TODO: move other groupby tests from test_dataset and test_dataarray over here diff --git a/xarray/test/test_merge.py b/xarray/test/test_merge.py index 11a6213640d..c612e61fe35 100644 --- a/xarray/test/test_merge.py +++ b/xarray/test/test_merge.py @@ -64,9 +64,9 @@ def test_merge_error(self): xr.merge([ds, ds + 1]) def test_merge_no_conflicts_single_var(self): - ds1 = xr.Dataset({'a': ('x', [1, 2])}) + ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) - expected = xr.Dataset({'a': ('x', [1, 2, 3])}) + expected = xr.Dataset({'a': ('x', [1, 2, 3]), 'x': [0, 1, 2]}) assert expected.identical(xr.merge([ds1, ds2], compat='no_conflicts')) assert expected.identical(xr.merge([ds2, ds1], @@ -193,10 +193,11 @@ def test_merge_compat(self): ds1.merge(ds2, compat='foobar') def test_merge_auto_align(self): - ds1 = xr.Dataset({'a': ('x', [1, 2])}) + ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = xr.Dataset({'b': ('x', [3, 4]), 'x': [1, 2]}) expected = xr.Dataset({'a': ('x', [1, 2, np.nan]), - 'b': ('x', [np.nan, 3, 4])}) + 'b': ('x', [np.nan, 3, 4])}, + {'x': [0, 1, 2]}) assert expected.identical(ds1.merge(ds2)) assert expected.identical(ds2.merge(ds1)) @@ -209,9 +210,9 @@ def test_merge_auto_align(self): assert expected.identical(ds2.merge(ds1, join='inner')) def test_merge_no_conflicts(self): - ds1 = xr.Dataset({'a': ('x', [1, 2])}) + ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) - expected = xr.Dataset({'a': ('x', [1, 2, 3])}) + expected = xr.Dataset({'a': ('x', [1, 2, 3]), 'x': [0, 1, 2]}) assert expected.identical(ds1.merge(ds2, compat='no_conflicts')) assert expected.identical(ds2.merge(ds1, compat='no_conflicts')) diff --git a/xarray/test/test_plot.py b/xarray/test/test_plot.py index 4a3102e9aa0..c3ca9f10f58 100644 --- a/xarray/test/test_plot.py +++ b/xarray/test/test_plot.py @@ -220,7 +220,7 @@ def test_plot_nans(self): def test_x_ticks_are_rotated_for_time(self): time = pd.date_range('2000-01-01', '2000-01-10') - a = DataArray(np.arange(len(time)), {'t': time}) + a = DataArray(np.arange(len(time)), [('t', time)]) a.plot.line() rotation = plt.gca().get_xticklabels()[0].get_rotation() self.assertFalse(rotation == 0) @@ -596,8 +596,7 @@ def test_bad_x_string_exception(self): def test_coord_strings(self): # 1d coords (same as dims) - self.assertIn('x', self.darray.coords) - self.assertIn('y', self.darray.coords) + self.assertEqual({'x', 'y'}, set(self.darray.dims)) self.plotmethod(y='y', x='x') def test_non_linked_coords(self): @@ -628,6 +627,7 @@ def test_non_linked_coords_transpose(self): def test_default_title(self): a = DataArray(easy_array((4, 3, 2)), dims=['a', 'b', 'c']) + a.coords['c'] = [0, 1] a.coords['d'] = u'foo' self.plotfunc(a.isel(c=1)) title = plt.gca().get_title() From 8226d1289039be78307e4c01b078334dcaf1fcc6 Mon Sep 17 00:00:00 
2001
From: Stephan Hoyer
Date: Fri, 11 Nov 2016 21:55:14 -0800
Subject: [PATCH 2/6] add issue link on optional-indexes to what's new

---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 3d79f856905..62fb7a66623 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -22,7 +22,7 @@ Breaking changes
 ~~~~~~~~~~~~~~~~
 
 - Index coordinates for each dimension are now optional, and no longer created
-  by default. This has a number of implications:
+  by default :issue:`1017`. This has a number of implications:
 
   - :py:func:`~align` and :py:meth:`~Dataset.reindex` can now error, if
     dimension labels are missing and dimensions have different sizes.

From 554f0078142e6e65005356334b60770ebf97bf91 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Fri, 11 Nov 2016 22:06:01 -0800
Subject: [PATCH 3/6] Fix test failure on Windows

---
 xarray/test/test_dataarray.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py
index 0ed22440d54..8a061c12260 100644
--- a/xarray/test/test_dataarray.py
+++ b/xarray/test/test_dataarray.py
@@ -33,7 +33,8 @@ def setUp(self):
 
     def test_repr(self):
         v = Variable(['time', 'x'], [[1, 2, 3], [4, 5, 6]], {'foo': 'bar'})
-        coords = OrderedDict([('x', [0, 1, 2]), ('other', np.int64(0))])
+        coords = OrderedDict([('x', np.arange(3, dtype=np.int64)),
+                              ('other', np.int64(0))])
         data_array = DataArray(v, coords, name='my_variable')
 
         expected = dedent("""\

From cfca6e567ab4f307f224b884d0c56dee20d58802 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Fri, 18 Nov 2016 19:02:05 -0800
Subject: [PATCH 4/6] use shared dimension summary in formatting.py

---
 xarray/core/formatting.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py
index 70093ec263f..1466a9d676f 100644
--- a/xarray/core/formatting.py
+++ b/xarray/core/formatting.py
@@ -314,17 +314,20 @@ def indexes_repr(indexes):
     return u'\n'.join(summary)
 
 
+def dim_summary(obj):
+    elements = [u'%s: %s' % (k, v) for k, v in obj.sizes.items()]
+    return u', '.join(elements)
+
+
 def array_repr(arr):
     # used for DataArray, Variable and IndexVariable
     if hasattr(arr, 'name') and arr.name is not None:
         name_str = '%r ' % arr.name
     else:
         name_str = u''
 
-    dim_summary = u', '.join(u'%s: %s' % (k, v) for k, v
-                             in zip(arr.dims, arr.shape))
-
     summary = [u'<xarray.%s %s(%s)>'
-               % (type(arr).__name__, name_str, dim_summary)]
+               % (type(arr).__name__, name_str, dim_summary(arr))]
 
     if isinstance(getattr(arr, 'variable', arr)._data, dask_array_type):
         summary.append(repr(arr.data))
@@ -349,8 +352,7 @@ def dataset_repr(ds):
     col_width = _calculate_col_width(_get_col_items(ds))
 
     dims_start = pretty_print(u'Dimensions:', col_width)
-    all_dim_strings = [u'%s: %s' % (k, v) for k, v in iteritems(ds.dims)]
-    summary.append(u'%s(%s)' % (dims_start, ', '.join(all_dim_strings)))
+    summary.append(u'%s(%s)' % (dims_start, dim_summary(ds)))
 
     summary.append(coords_repr(ds.coords, col_width=col_width))
     summary.append(vars_repr(ds.data_vars, col_width=col_width))

From 4e0319df6d9268073cc4739f2e38c20ba8747440 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 8 Dec 2016 10:59:58 +0100
Subject: [PATCH 5/6] missing coordinates appear in the repr

---
 xarray/core/formatting.py     | 19 +++++++++++++++++--
 xarray/test/test_dataarray.py |  1 +
 xarray/test/test_dataset.py   |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/xarray/core/formatting.py 
b/xarray/core/formatting.py index 1466a9d676f..491dc24a585 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -15,7 +15,8 @@ from pandas.tslib import OutOfBoundsDatetime from .options import OPTIONS -from .pycompat import PY2, iteritems, unicode_type, bytes_type, dask_array_type +from .pycompat import ( + PY2, unicode_type, bytes_type, dask_array_type, OrderedDict) def pretty_print(x, numchars): @@ -207,6 +208,13 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True, return front_str + values_str +def _summarize_dummy_var(name, col_width, marker=u'*', values=u'-'): + """Used if there is no coordinate for a dimension.""" + first_col = pretty_print(u' %s %s ' % (marker, name), col_width) + dims_str = u'(%s) ' % unicode_type(name) + return u'%s%s%s' % (first_col, dims_str, values) + + def _summarize_coord_multiindex(coord, col_width, marker): first_col = pretty_print(u' %s %s ' % (marker, coord.name), col_width) return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0])) @@ -237,6 +245,8 @@ def summarize_var(name, var, col_width): def summarize_coord(name, var, col_width): + if var is None: + return _summarize_dummy_var(name, col_width) is_index = name in var.dims show_values = is_index or _not_remote(var) marker = u'*' if is_index else u' ' @@ -303,7 +313,12 @@ def _mapping_repr(mapping, title, summarizer, col_width=None): def coords_repr(coords, col_width=None): if col_width is None: col_width = _calculate_col_width(_get_col_items(coords)) - return _mapping_repr(coords, title=u'Coordinates', + # augment coordinates to include markers for missing coordinates + augmented_coords = OrderedDict(coords) + for dim in coords.dims: + if dim not in augmented_coords: + augmented_coords[dim] = None + return _mapping_repr(augmented_coords, title=u'Coordinates', summarizer=summarize_coord, col_width=col_width) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 8a061c12260..3ff79853efe 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -43,6 +43,7 @@ def test_repr(self): Coordinates: * x (x) int64 0 1 2 other int64 0 + * time (time) - Attributes: foo: bar""") self.assertEqual(expected, repr(data_array)) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index cb985123b4b..5a11f96bd51 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -76,6 +76,7 @@ def test_repr(self): * dim2 (dim2) float64 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) %s 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' numbers (dim3) int64 0 1 2 0 0 1 1 2 2 3 + * dim1 (dim1) - Data variables: var1 (dim1, dim2) float64 -1.086 0.9973 0.283 -1.506 -0.5786 1.651 ... var2 (dim1, dim2) float64 1.162 -1.097 -2.123 1.04 -0.4034 -0.126 ... 
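With this patch, a dimension that has no coordinate variable still appears in the
``Coordinates`` block of the repr, as a dummy entry with ``-`` in place of values
(the follow-up patch below switches the dummy marker from ``*`` to ``o``). A
minimal sketch of the new behavior, using a hypothetical dataset rather than the
test fixtures; exact spacing and dtypes may differ:

    import numpy as np
    import xarray as xr

    # 'y' gets coordinate labels, 'x' does not
    ds = xr.Dataset({'foo': (('x', 'y'), np.zeros((2, 3)))},
                    coords={'y': [10, 20, 30]})
    print(ds)

which would print something like:

    <xarray.Dataset>
    Dimensions:  (x: 2, y: 3)
    Coordinates:
      * y        (y) int64 10 20 30
      * x        (x) -
    Data variables:
        foo      (x, y) float64 0.0 0.0 0.0 0.0 0.0 0.0
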
From fd4ae6cc8741b35ed6342d5872e8cf6cdd5a386b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 14 Dec 2016 17:57:22 -0800 Subject: [PATCH 6/6] Mark missing coords with "o" in the repr --- xarray/core/formatting.py | 2 +- xarray/test/test_dataarray.py | 2 +- xarray/test/test_dataset.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 491dc24a585..cdcc4b4e1a5 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -208,7 +208,7 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True, return front_str + values_str -def _summarize_dummy_var(name, col_width, marker=u'*', values=u'-'): +def _summarize_dummy_var(name, col_width, marker=u'o', values=u'-'): """Used if there is no coordinate for a dimension.""" first_col = pretty_print(u' %s %s ' % (marker, name), col_width) dims_str = u'(%s) ' % unicode_type(name) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index a4a1f093738..85b3cefd3a2 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -44,7 +44,7 @@ def test_repr(self): Coordinates: * x (x) int64 0 1 2 other int64 0 - * time (time) - + o time (time) - Attributes: foo: bar""") self.assertEqual(expected, repr(data_array)) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 98b3de36234..271fcd2c377 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -90,7 +90,7 @@ def test_repr(self): * dim2 (dim2) float64 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) %s 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' numbers (dim3) int64 0 1 2 0 0 1 1 2 2 3 - * dim1 (dim1) - + o dim1 (dim1) - Data variables: var1 (dim1, dim2) float64 -1.086 0.9973 0.283 -1.506 -0.5786 1.651 ... var2 (dim1, dim2) float64 1.162 -1.097 -2.123 1.04 -0.4034 -0.126 ...
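
Taken together, the series makes dimension coordinates genuinely optional rather
than auto-created. A minimal sketch of the resulting user-facing behavior,
modeled directly on the ``test_get_index`` test added above (hypothetical
session):

    import numpy as np
    import pandas as pd
    import xarray as xr

    ds = xr.Dataset({'foo': (('x', 'y'), np.zeros((2, 3)))},
                    coords={'x': ['a', 'b']})

    # no default coordinate is created for the unlabeled dimension 'y'
    assert 'y' not in ds.coords

    # get_index() still returns a pandas index on demand, falling back to a
    # default range(dim_size) index for dimensions without coordinates
    assert ds.get_index('x').equals(pd.Index(['a', 'b']))
    assert ds.get_index('y').equals(pd.Index([0, 1, 2]))

    # asking for a nonexistent dimension still raises
    try:
        ds.get_index('z')
    except KeyError:
        pass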