diff --git a/doc/api.rst b/doc/api.rst index f0125792a2c..03af7bb46a8 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -5,7 +5,7 @@ API reference ############# This page provides an auto-generated summary of xray's API. For more details -and examples, refer to the relevant chapter in the main part of the +and examples, refer to the relevant chapters in the main part of the documentation. Top-level functions @@ -110,10 +110,7 @@ Computation Dataset.reduce Dataset.groupby Dataset.resample - Dataset.transpose Dataset.diff - Dataset.shift - Dataset.roll **Aggregation**: :py:attr:`~Dataset.all` @@ -155,6 +152,18 @@ Computation :py:attr:`~core.groupby.DatasetGroupBy.fillna` :py:attr:`~core.groupby.DatasetGroupBy.where` +Reshaping and reorganizing +-------------------------- + +.. autosummary:: + :toctree: generated/ + + Dataset.transpose + Dataset.stack + Dataset.unstack + Dataset.shift + Dataset.roll + DataArray ========= @@ -218,6 +227,16 @@ Indexing DataArray.reindex DataArray.reindex_like +Comparisons +----------- + +.. autosummary:: + :toctree: generated/ + + DataArray.equals + DataArray.identical + DataArray.broadcast_equals + Computation ----------- @@ -227,11 +246,8 @@ Computation DataArray.reduce DataArray.groupby DataArray.resample - DataArray.transpose DataArray.get_axis_num DataArray.diff - DataArray.shift - DataArray.roll **Aggregation**: :py:attr:`~DataArray.all` @@ -273,16 +289,18 @@ Computation :py:attr:`~core.groupby.DataArrayGroupBy.fillna` :py:attr:`~core.groupby.DataArrayGroupBy.where` -Comparisons ------------ + +Reshaping and reorganizing +-------------------------- .. autosummary:: :toctree: generated/ - DataArray.equals - DataArray.identical - DataArray.broadcast_equals - + DataArray.transpose + DataArray.stack + DataArray.unstack + DataArray.shift + DataArray.roll .. 
_api.ufuncs: diff --git a/doc/computation.rst b/doc/computation.rst index 69002ea5b17..6f1ed8bf79b 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -140,6 +140,13 @@ This means, for example, that you can always subtract an array from its transpose: c - c.T +You can explicitly broadcast xray data structures by using the +:py:func:`~xray.broadcast` function:: + + a2, b2 = xray.broadcast(a, b) + a2 + b2 + .. _math automatic alignment: Automatic alignment @@ -223,13 +230,6 @@ Datasets support most of the same methods found on data arrays: ds.mean(dim='x') abs(ds) -:py:meth:`~xray.Dataset.transpose` can also be used to reorder dimensions on -all variables: - -.. ipython:: python - - ds.transpose('y', 'x') - Unfortunately, a limitation of the current version of numpy means that we cannot override ufuncs for datasets, because datasets cannot be written as a single array [1]_. :py:meth:`~xray.Dataset.apply` works around this @@ -256,5 +256,5 @@ Arithmetic between two datasets matches data variables of the same name: Similarly to index based alignment, the result has the intersection of all matching variables, and ``ValueError`` is raised if the result would be empty. -.. [1] When numpy 1.10 is released, we should be able to override ufuncs for +.. [1] When numpy 1.12 is released, we should be able to override ufuncs for datasets by making use of ``__numpy_ufunc__``. diff --git a/doc/data-structures.rst b/doc/data-structures.rst index 21f73cd41c8..cd4553e0ef4 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -436,8 +436,8 @@ dataset variables: ds.rename({'temperature': 'temp', 'precipitation': 'precip'}) -Finally, you can use :py:meth:`~xray.Dataset.swap_dims` to swap dimension and -non-dimension variables: +The related :py:meth:`~xray.Dataset.swap_dims` method allows you to swap +dimension and non-dimension variables: ..
ipython:: python @@ -535,48 +535,6 @@ dimension and whose the values are ``Index`` objects: ds.indexes -Converting datasets and arrays ------------------------------- - -To convert from a Dataset to a DataArray, use :py:meth:`~xray.Dataset.to_array`: - -.. ipython:: python - - arr = ds.to_array() - arr - -This method broadcasts all data variables in the dataset against each other, -then concatenates them along a new dimension into a new array while preserving -coordinates. - -To convert back from a DataArray to a Dataset, use -:py:meth:`~xray.DataArray.to_dataset`: - -.. ipython:: python - - arr.to_dataset(dim='variable') - -The broadcasting behavior of ``to_array`` means that the resulting array -includes the union of data variable dimensions: - -.. ipython:: python - - ds2 = xray.Dataset({'a': 0, 'b': ('x', [3, 4, 5])}) - - # the input dataset has 4 elements - ds2 - - # the resulting array has 6 elements - ds2.to_array() - -Otherwise, the result could not be represented as an orthogonal array. - -If you use ``to_dataset`` without supplying the ``dim`` argument, the DataArray will be converted into a Dataset of one variable: - -.. ipython:: python - - arr.to_dataset(name='combined') - .. [1] Latitude and longitude are 2D arrays because the dataset uses `projected coordinates`__. ``reference_time`` refers to the reference time diff --git a/doc/index.rst b/doc/index.rst index aea6d8f8f0c..0be9822be82 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -36,6 +36,7 @@ Documentation indexing computation groupby + reshaping combining time-series pandas diff --git a/doc/reshaping.rst b/doc/reshaping.rst new file mode 100644 index 00000000000..104d70a5d37 --- /dev/null +++ b/doc/reshaping.rst @@ -0,0 +1,125 @@ +.. _reshape: + +############################### +Reshaping and reorganizing data +############################### + +These methods allow you to reorganize the dimensions and layout of your data. + +..
ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xray + np.random.seed(123456) + +Reordering dimensions +--------------------- + +To reorder dimensions on a :py:class:`~xray.DataArray` or across all variables +on a :py:class:`~xray.Dataset`, use :py:meth:`~xray.DataArray.transpose` or the +``.T`` property: + +.. ipython:: python + + ds = xray.Dataset({'foo': (('x', 'y', 'z'), [[[42]]]), 'bar': (('y', 'z'), [[24]])}) + ds.transpose('y', 'z', 'x') + ds.T + +Converting between datasets and arrays +-------------------------------------- + +To convert from a Dataset to a DataArray, use :py:meth:`~xray.Dataset.to_array`: + +.. ipython:: python + + arr = ds.to_array() + arr + +This method broadcasts all data variables in the dataset against each other, +then concatenates them along a new dimension into a new array while preserving +coordinates. + +To convert back from a DataArray to a Dataset, use +:py:meth:`~xray.DataArray.to_dataset`: + +.. ipython:: python + + arr.to_dataset(dim='variable') + +The broadcasting behavior of ``to_array`` means that the resulting array +includes the union of data variable dimensions: + +.. ipython:: python + + ds2 = xray.Dataset({'a': 0, 'b': ('x', [3, 4, 5])}) + + # the input dataset has 4 elements + ds2 + + # the resulting array has 6 elements + ds2.to_array() + +Otherwise, the result could not be represented as an orthogonal array. + +If you use ``to_dataset`` without supplying the ``dim`` argument, the DataArray will be converted into a Dataset of one variable: + +.. ipython:: python + + arr.to_dataset(name='combined') + +.. _reshape.stack: + +Stack and unstack +----------------- + +As part of xray's nascent support for :py:class:`pandas.MultiIndex`, we have +implemented :py:meth:`~xray.DataArray.stack` and +:py:meth:`~xray.DataArray.unstack` methods for combining or splitting dimensions: + +..
ipython:: python + + array = xray.DataArray(np.random.randn(2, 3), + coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) + stacked = array.stack(z=('x', 'y')) + stacked + stacked.unstack('z') + +These methods are modeled on the :py:class:`pandas.DataFrame` methods of the +same name, although in xray they always create new dimensions rather than +adding to the existing index or columns. + +Like :py:meth:`DataFrame.unstack`, xray's ``unstack`` always succeeds, even +if the multi-index being unstacked does not contain all possible levels. Missing +levels are filled in with ``NaN`` in the resulting object: + +.. ipython:: python + + stacked2 = stacked[::2] + stacked2 + stacked2.unstack('z') + +However, xray's ``stack`` has an important difference from pandas: unlike +pandas, it does not automatically drop missing values. Compare: + +.. ipython:: python + + array = xray.DataArray([[np.nan, 1], [2, 3]], dims=['x', 'y']) + array.stack(z=('x', 'y')) + array.to_pandas().stack() + +We departed from pandas's behavior here because predictable shapes for new +array dimensions are necessary for :ref:`dask`. + +Shift and roll +-------------- + +To adjust coordinate labels, you can use the :py:meth:`~xray.Dataset.shift` and +:py:meth:`~xray.Dataset.roll` methods: + +.. ipython:: python + + array = xray.DataArray([1, 2, 3, 4], dims='x') + array.shift(x=2) + array.roll(x=2) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a8e97e102cd..84a537d19af 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -12,6 +12,31 @@ What's New v0.7.0 (unreleased) ------------------- +This major release includes a redesign of :py:class:`~xray.DataArray` +internals, as well as new methods for reshaping, rolling and shifting +data. It includes preliminary support for :py:class:`pandas.MultiIndex`, +as well as a number of other features and bug fixes, several of which +offer improved compatibility with pandas. + +New name +~~~~~~~~ + +The project formerly known as "xray" is now "xarray"!
This avoids a namespace +conflict with the entirety of x-ray science. Renaming our project seemed like +the right thing to do, especially because some scientists who work with actual +x-rays are interested in using this project in their work. Thanks for your +understanding and patience in this transition. You can now find our +documentation and code repository at new URLs: + +- http://xarray.pydata.org +- http://github.com/pydata/xarray/ + +To ease the transition, we have simultaneously released v0.7.0 of both +``xray`` and ``xarray`` on the Python Package Index. These packages are +identical, except the former issues a deprecation warning when imported. This +will be the last xray release. We recommend switching your imports going +forward to ``import xarray as xr``. + .. _v0.7.0.breaking: Breaking changes @@ -61,26 +86,61 @@ Breaking changes .. _this stackoverflow report: http://stackoverflow.com/questions/33158558/python-xray-extract-first-and-last-time-value-within-each-month-of-a-timeseries -Bug fixes -~~~~~~~~~ +Enhancements +~~~~~~~~~~~~ -- Fixes for several issues found on ``DataArray`` objects with the same name - as one of their coordinates (see :ref:`v0.7.0.breaking` for more details). -- Attempting to assign a ``Dataset`` or ``DataArray`` variable/attribute using - attribute-style syntax (e.g., ``ds.foo = 42``) now raises an error rather - than silently failing (:issue:`656`, :issue:`714`). +- Basic support for :py:class:`~pandas.MultiIndex` coordinates on xray objects, including + indexing, :py:meth:`~DataArray.stack` and :py:meth:`~DataArray.unstack`: -- ``DataArray.to_masked_array`` always returns masked array with mask being an array -(not a scalar value) (:issue:`684`) -- You can now pass pandas objects with non-numpy dtypes (e.g., ``categorical`` - or ``datetime64`` with a timezone) into xray without an error - (:issue:`716`). + .. 
ipython:: + :verbatim: + + In [7]: df = pd.DataFrame({'foo': range(3), + ...: 'x': ['a', 'b', 'b'], + ...: 'y': [0, 0, 1]}) + + In [8]: s = df.set_index(['x', 'y'])['foo'] + + In [12]: arr = xray.DataArray(s, dims='z') + + In [13]: arr + Out[13]: + <xray.DataArray 'foo' (z: 3)> + array([0, 1, 2]) + Coordinates: + * z (z) object ('a', 0) ('b', 0) ('b', 1) + + In [19]: arr.indexes['z'] + Out[19]: + MultiIndex(levels=[[u'a', u'b'], [0, 1]], + labels=[[0, 1, 1], [0, 0, 1]], + names=[u'x', u'y']) + + In [14]: arr.unstack('z') + Out[14]: + <xray.DataArray 'foo' (x: 2, y: 2)> + array([[ 0., nan], + [ 1., 2.]]) + Coordinates: + * x (x) object 'a' 'b' + * y (y) int64 0 1 + + In [26]: arr.unstack('z').stack(z=('x', 'y')) + Out[26]: + <xray.DataArray 'foo' (z: 4)> + array([ 0., nan, 1., 2.]) + Coordinates: + * z (z) object ('a', 0) ('a', 1) ('b', 0) ('b', 1) + + See :ref:`reshape.stack` for more details. + + .. warning:: + + xray's MultiIndex support is still experimental, and we have a long + to-do list of desired additions (:issue:`719`), including better display of + multi-index levels when printing a ``Dataset``, and support for saving + datasets with a MultiIndex to a netCDF file. User contributions in this + area would be greatly appreciated. - Support for reading GRIB, HDF4 and other file formats via PyNIO_. See :ref:`io.pynio` for more details. @@ -100,10 +160,10 @@ Enhancements Notice that ``shift`` moves data independently of coordinates, but ``roll`` moves both data and coordinates. -- Assigning a ``pandas`` object to the variable of ``Dataset`` directly is now permitted. Its - index names correspond to the ``dims`` of the ``Dataset``, and its data is aligned +- Assigning a ``pandas`` object directly as a ``Dataset`` variable is now permitted. Its + index names correspond to the ``dims`` of the ``Dataset``, and its data is aligned. - Passing a :py:class:`pandas.DataFrame` or :py:class:`pandas.Panel` to a Dataset constructor - is now permitted + is now permitted.
- New function :py:func:`~xray.broadcast` for explicitly broadcasting ``DataArray`` and ``Dataset`` objects against each other. For example: @@ -120,7 +180,30 @@ Enhancements Bug fixes ~~~~~~~~~ +- Fixes for several issues found on ``DataArray`` objects with the same name + as one of their coordinates (see :ref:`v0.7.0.breaking` for more details). +- ``DataArray.to_masked_array`` now always returns a masked array whose mask is an + array (not a scalar value) (:issue:`684`). - Allows for (imperfect) repr of Coords when underlying index is PeriodIndex (:issue:`645`). +- Attempting to assign a ``Dataset`` or ``DataArray`` variable/attribute using + attribute-style syntax (e.g., ``ds.foo = 42``) now raises an error rather + than silently failing (:issue:`656`, :issue:`714`). +- You can now pass pandas objects with non-numpy dtypes (e.g., ``categorical`` + or ``datetime64`` with a timezone) into xray without an error + (:issue:`716`).
+ +Acknowledgments +~~~~~~~~~~~~~~~ + +The following individuals contributed to this release: + +- Antony Lee +- Fabien Maussion +- Joe Hamman +- Maximilian Roos +- Stephan Hoyer +- Takeshi Kanmae +- femtotrader v0.6.1 (21 October 2015) ------------------------ diff --git a/xray/backends/api.py b/xray/backends/api.py index 9bf4cf05b98..4ce4d1405d4 100644 --- a/xray/backends/api.py +++ b/xray/backends/api.py @@ -170,9 +170,8 @@ def maybe_decode_store(store, lock=False): else: file_arg = filename_or_obj token = tokenize(file_arg, group, decode_cf, mask_and_scale, - decode_times, concat_characters, - decode_coords, engine, chunks, lock, - drop_variables) + decode_times, concat_characters, decode_coords, + engine, chunks, drop_variables) name_prefix = '%s:%s/' % (filename_or_obj, group or '') ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token, lock=lock) diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index f78663842bb..95380f80811 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -763,6 +763,73 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) + def stack(self, **dimensions): + """ + Stack any number of existing dimensions into a single new dimension. + + New dimensions will be added at the end, and the corresponding + coordinate variables will be combined into a MultiIndex. + + Parameters + ---------- + **dimensions : keyword arguments of the form new_name=(dim1, dim2, ...) + Names of new dimensions, and the existing dimensions that they + replace. + + Returns + ------- + stacked : DataArray + DataArray with stacked data. + + Example + ------- + + >>> arr = DataArray(np.arange(6).reshape(2, 3), + ... 
coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) + >>> arr + + array([[0, 1, 2], + [3, 4, 5]]) + Coordinates: + * x (x) |S1 'a' 'b' + * y (y) int64 0 1 2 + >>> stacked = arr.stack(z=('x', 'y')) + >>> stacked.indexes['z'] + MultiIndex(levels=[[u'a', u'b'], [0, 1, 2]], + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + names=[u'x', u'y']) + + See also + -------- + DataArray.unstack + """ + ds = self._to_temp_dataset().stack(**dimensions) + return self._from_temp_dataset(ds) + + def unstack(self, dim): + """ + Unstack an existing dimension corresponding to a MultiIndex into + multiple new dimensions. + + New dimensions will be added at the end. + + Parameters + ---------- + dim : str + Name of the existing dimension to unstack. + + Returns + ------- + unstacked : DataArray + Array with unstacked data. + + See also + -------- + DataArray.stack + """ + ds = self._to_temp_dataset().unstack(dim) + return self._from_temp_dataset(ds) + def transpose(self, *dims): """Return a new DataArray object with transposed dimensions. 
def _stack_once(self, dims, new_dim):
    """Combine the dimensions ``dims`` into one new dimension ``new_dim``.

    Variables that use at least one of ``dims`` are first expanded to
    include all of them and then stacked; variables that use none of them
    are shallow-copied. The variables named in ``dims`` themselves are
    dropped and replaced by a single ``new_dim`` coordinate holding the
    product MultiIndex of their indexes.
    """
    stacked_vars = OrderedDict()
    for key, variable in self.variables.items():
        if key in dims:
            # superseded by the MultiIndex coordinate created below
            continue
        if not any(d in variable.dims for d in dims):
            stacked_vars[key] = variable.copy(deep=False)
            continue
        # broadcast so that every stacked dimension is present
        missing = [d for d in dims if d not in variable.dims]
        full_dims = list(variable.dims) + missing
        full_shape = [self.dims[d] for d in full_dims]
        expanded = variable.expand_dims(full_dims, full_shape)
        stacked_vars[key] = expanded.stack(**{new_dim: dims})

    levels = [self.indexes[d] for d in dims]
    multi_index = pd.MultiIndex.from_product(levels, names=dims)
    stacked_vars[new_dim] = Coordinate(new_dim, multi_index)

    coord_names = (set(self._coord_names) - set(dims)) | set([new_dim])

    return self._replace_vars_and_dims(stacked_vars, coord_names)


def stack(self, **dimensions):
    """
    Stack any number of existing dimensions into a single new dimension.

    Each new dimension is appended at the end, and the coordinate
    variables of the dimensions it replaces are combined into a
    MultiIndex.

    Parameters
    ----------
    **dimensions : keyword arguments of the form new_name=(dim1, dim2, ...)
        Names of new dimensions, and the existing dimensions that they
        replace.

    Returns
    -------
    stacked : Dataset
        Dataset with stacked data.

    See also
    --------
    Dataset.unstack
    """
    stacked = self
    for new_name, old_dims in dimensions.items():
        stacked = stacked._stack_once(old_dims, new_name)
    return stacked
+ + Returns + ------- + unstacked : Dataset + Dataset with unstacked data. + + See also + -------- + Dataset.stack + """ + if dim not in self.dims: + raise ValueError('invalid dimension: %s' % dim) + + index = self.indexes[dim] + if not isinstance(index, pd.MultiIndex): + raise ValueError('cannot unstack a dimension that does not have ' + 'a MultiIndex') + + full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) + obj = self.reindex(copy=False, **{dim: full_idx}) + + new_dim_names = index.names + if any(name is None for name in new_dim_names): + raise ValueError('cannot unstack dimension with unnamed levels') + + new_dim_sizes = [lev.size for lev in index.levels] + + variables = OrderedDict() + for name, var in obj.variables.items(): + if name != dim: + if dim in var.dims: + new_dims = OrderedDict(zip(new_dim_names, new_dim_sizes)) + variables[name] = var.unstack(**{dim: new_dims}) + else: + variables[name] = var + + for name, lev in zip(new_dim_names, index.levels): + variables[name] = Coordinate(name, lev) + + coord_names = set(self._coord_names) - set([dim]) | set(new_dim_names) + + return self._replace_vars_and_dims(variables, coord_names) + def update(self, other, inplace=True): """Update this dataset's variables with those from another dataset. diff --git a/xray/core/indexing.py b/xray/core/indexing.py index 277428966a5..2dffff2910c 100644 --- a/xray/core/indexing.py +++ b/xray/core/indexing.py @@ -117,6 +117,24 @@ def _try_get_item(x): return x +def _asarray_tuplesafe(values): + """ + Convert values into a numpy array of at most 1-dimension, while preserving + tuples. 
+ + Adapted from pandas.core.common._asarray_tuplesafe + """ + if isinstance(values, tuple): + result = utils.tuple_to_0darray(values) + else: + result = np.asarray(values) + if result.ndim == 2: + result = np.empty(len(values), dtype=object) + result[:] = values + + return result + + def convert_label_indexer(index, label, index_name='', method=None, tolerance=None): """Given a pandas.Index (or xray.Coordinate) and labels (e.g., from @@ -149,9 +167,9 @@ def convert_label_indexer(index, label, index_name='', method=None, 'dimension %r with a slice over integer positions; ' 'the index is unsorted or non-unique') else: - label = np.asarray(label) + label = _asarray_tuplesafe(label) if label.ndim == 0: - indexer = index.get_loc(np.asscalar(label), **kwargs) + indexer = index.get_loc(label.item(), **kwargs) elif label.dtype.kind == 'b': indexer, = np.nonzero(label) else: @@ -368,10 +386,10 @@ class PandasIndexAdapter(utils.NDArrayMixin): def __init__(self, array, dtype=None): self.array = utils.safe_cast_to_index(array) if dtype is None: - # if a PeriodIndex, force an object dtype if isinstance(array, pd.PeriodIndex): dtype = np.dtype('O') elif hasattr(array, 'categories'): + # category isn't a real numpy dtype dtype = array.categories.dtype elif not utils.is_valid_numpy_dtype(array.dtype): dtype = np.dtype('O') @@ -391,7 +409,12 @@ def __array__(self, dtype=None): with suppress(AttributeError): # this might not be public API array = array.asobject - return np.asarray(array, dtype) + return np.asarray(array.values, dtype=dtype) + + @property + def shape(self): + # .shape is broken on pandas prior to v0.15.2 + return (len(self.array),) def __getitem__(self, key): if isinstance(key, tuple) and len(key) == 1: @@ -399,22 +422,22 @@ def __getitem__(self, key): # objects don't like tuples) key, = key - if isinstance(key, (int, np.integer)): - value = self.array[key] - if value is pd.NaT: - # work around the impossibility of casting NaT with asarray - # note: it probably 
def tuple_to_0darray(value):
    """Wrap ``value`` in a 0-dimensional numpy array of dtype object.

    The array is allocated with object dtype up front and the value is
    stored as its single element, so a tuple is kept whole rather than
    being unpacked into array elements.
    """
    result = np.empty((), dtype=object)
    # an empty-tuple index addresses the one element of a 0-d array
    result[()] = value
    return result
diff --git a/xray/core/variable.py b/xray/core/variable.py index 405bb906d52..c1c69efe9e6 100644 --- a/xray/core/variable.py +++ b/xray/core/variable.py @@ -105,15 +105,15 @@ def as_compatible_data(data, fastpath=False): return data if isinstance(data, pd.Index): - if isinstance(data, pd.MultiIndex): - raise NotImplementedError( - 'no support yet for using a pandas.MultiIndex in an ' - 'xray.Coordinate') return _maybe_wrap_data(data) + if isinstance(data, tuple): + data = utils.tuple_to_0darray(data) + if isinstance(data, pd.Timestamp): # TODO: convert, handle datetime objects, too data = np.datetime64(data.value, 'ns') + if isinstance(data, timedelta): data = np.timedelta64(getattr(data, 'value', data), 'ns') @@ -250,7 +250,7 @@ def data(self, data): self._data = data def _data_cached(self): - if not isinstance(self._data, np.ndarray): + if not isinstance(self._data, (np.ndarray, PandasIndexAdapter)): self._data = np.asarray(self._data) return self._data @@ -720,6 +720,110 @@ def expand_dims(self, dims, shape=None): self._encoding, fastpath=True) return expanded_var.transpose(*dims) + def _stack_once(self, dims, new_dim): + if not set(dims) <= set(self.dims): + raise ValueError('invalid existing dimensions: %s' % dims) + + if new_dim in self.dims: + raise ValueError('cannot create a new dimension with the same ' + 'name as an existing dimension') + + if len(dims) == 0: + # don't stack + return self.copy(deep=False) + + other_dims = [d for d in self.dims if d not in dims] + dim_order = other_dims + list(dims) + reordered = self.transpose(*dim_order) + + new_shape = reordered.shape[:len(other_dims)] + (-1,) + new_data = reordered.data.reshape(new_shape) + new_dims = reordered.dims[:len(other_dims)] + (new_dim,) + + return Variable(new_dims, new_data, self._attrs, self._encoding, + fastpath=True) + + def stack(self, **dimensions): + """ + Stack any number of existing dimensions into a single new dimension. 
+ + New dimensions will be added at the end, and the order of the data + along each new dimension will be in contiguous (C) order. + + Parameters + ---------- + **dimensions : keyword arguments of the form new_name=(dim1, dim2, ...) + Names of new dimensions, and the existing dimensions that they + replace. + + Returns + ------- + stacked : Variable + Variable with the same attributes but stacked data. + + See also + -------- + Variable.unstack + """ + result = self + for new_dim, dims in dimensions.items(): + result = result._stack_once(dims, new_dim) + return result + + def _unstack_once(self, dims, old_dim): + new_dim_names = tuple(dims.keys()) + new_dim_sizes = tuple(dims.values()) + + if old_dim not in self.dims: + raise ValueError('invalid existing dimension: %s' % old_dim) + + if set(new_dim_names).intersection(self.dims): + raise ValueError('cannot create a new dimension with the same ' + 'name as an existing dimension') + + axis = self.get_axis_num(old_dim) + if np.prod(new_dim_sizes) != self.shape[axis]: + raise ValueError('the product of the new dimension sizes must ' + 'equal the size of the old dimension') + + other_dims = [d for d in self.dims if d != old_dim] + dim_order = other_dims + [old_dim] + reordered = self.transpose(*dim_order) + + new_shape = reordered.shape[:len(other_dims)] + new_dim_sizes + new_data = reordered.data.reshape(new_shape) + new_dims = reordered.dims[:len(other_dims)] + new_dim_names + + return Variable(new_dims, new_data, self._attrs, self._encoding, + fastpath=True) + + def unstack(self, **dimensions): + """ + Unstack an existing dimension into multiple new dimensions. + + New dimensions will be added at the end, and the order of the data + along each new dimension will be in contiguous (C) order. + + Parameters + ---------- + **dimensions : keyword arguments of the form old_dim={dim1: size1, ...} + Names of existing dimensions, and the new dimensions and sizes that they + map to. 
+ + Returns + ------- + unstacked : Variable + Variable with the same attributes but unstacked data. + + See also + -------- + Variable.stack + """ + result = self + for old_dim, dims in dimensions.items(): + result = result._unstack_once(dims, old_dim) + return result + def fillna(self, value): return self._fillna(value) @@ -989,9 +1093,10 @@ def to_index(self): # n.b. creating a new pandas.Index from an old pandas.Index is # basically free as pandas.Index objects are immutable assert self.ndim == 1 - return pd.Index(self._data_cached().array, name=self.dims[0]) - - # pandas.Index like properties: + index = self._data_cached().array + if not isinstance(index, pd.MultiIndex): + index = index.set_names(self.name) + return index @property def name(self): @@ -1001,25 +1106,6 @@ def name(self): def name(self, value): raise AttributeError('cannot modify name of Coordinate in-place') - def get_indexer(self, label): - return self.to_index().get_indexer(label) - - def slice_indexer(self, start=None, stop=None, step=None): - return self.to_index().slice_indexer(start, stop, step) - - def slice_locs(self, start=None, stop=None): - return self.to_index().slice_locs(start, stop) - - def get_loc(self, label): - return self.to_index().get_loc(label) - - @property - def is_monotonic(self): - return self.to_index().is_monotonic - - def is_numeric(self): - return self.to_index().is_numeric() - def _unified_dims(variables): # validate dimensions diff --git a/xray/test/test_dask.py b/xray/test/test_dask.py index 7e245b233ba..215cfaa13f8 100644 --- a/xray/test/test_dask.py +++ b/xray/test/test_dask.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from xray import Variable, DataArray, Dataset, concat import xray.ufuncs as xu @@ -288,3 +289,14 @@ def counting_get(*args, **kwargs): with dask.set_options(get=counting_get): ds.load() self.assertEqual(count[0], 1) + + def test_stack(self): + data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4)) + arr = DataArray(data, 
dims=('w', 'x', 'y')) + stacked = arr.stack(z=('x', 'y')) + z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], + names=['x', 'y']) + expected = DataArray(data.reshape(2, -1), {'w': [0, 1], 'z': z}, + dims=['w', 'z']) + assert stacked.data.chunks == expected.data.chunks + self.assertLazyAndIdentical(expected, stacked) diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py index 854069ce7fb..c1135601ef8 100644 --- a/xray/test/test_dataarray.py +++ b/xray/test/test_dataarray.py @@ -215,10 +215,6 @@ def test_constructor_from_self_described(self): actual = DataArray(Coordinate('foo', ['a', 'b'])) self.assertDataArrayIdentical(expected, actual) - s = pd.Series(range(2), pd.MultiIndex.from_product([['a', 'b'], [0]])) - with self.assertRaisesRegexp(NotImplementedError, 'MultiIndex'): - DataArray(s) - def test_constructor_from_0d(self): expected = Dataset({None: ([], 0)})[None] actual = DataArray(0) @@ -481,6 +477,17 @@ def test_loc_single_boolean(self): self.assertEqual(data.loc[True], 0) self.assertEqual(data.loc[False], 1) + def test_multiindex(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + data = DataArray(range(6), [('x', idx)]) + + self.assertDataArrayIdentical(data.sel(x=('a', 0)), data.isel(x=0)) + self.assertDataArrayIdentical(data.sel(x=('c', 1)), data.isel(x=-1)) + self.assertDataArrayIdentical(data.sel(x=[('a', 0)]), data.isel(x=[0])) + self.assertDataArrayIdentical(data.sel(x=[('a', 0), ('c', 1)]), + data.isel(x=[0, -1])) + self.assertDataArrayIdentical(data.sel(x='a'), data.isel(x=slice(2))) + def test_time_components(self): dates = pd.date_range('2000-01-01', periods=10) da = DataArray(np.arange(1, 11), [('time', dates)]) @@ -856,6 +863,20 @@ def test_dataset_math(self): actual['tmin'] -= obs['tmin'] self.assertDatasetIdentical(actual, expected) + def test_stack_unstack(self): + orig = DataArray([[0, 1], [2, 3]], dims=['x', 'y'], attrs={'foo': 2}) + actual = orig.stack(z=['x', 'y']).unstack('z') + 
self.assertDataArrayIdentical(orig, actual) + + def test_unstack_pandas_consistency(self): + df = pd.DataFrame({'foo': range(3), + 'x': ['a', 'b', 'b'], + 'y': [0, 0, 1]}) + s = df.set_index(['x', 'y'])['foo'] + expected = DataArray(s.unstack(), name='foo') + actual = DataArray(s, dims='z').unstack('z') + self.assertDataArrayIdentical(expected, actual) + def test_transpose(self): self.assertVariableEqual(self.dv.variable.transpose(), self.dv.transpose()) @@ -1392,6 +1413,15 @@ def test_to_and_from_series(self): self.assertDataArrayIdentical(expected_da, DataArray.from_series(actual)) + def test_series_categorical_index(self): + # regression test for GH700 + if not hasattr(pd, 'CategoricalIndex'): + raise unittest.SkipTest('requires pandas with CategoricalIndex') + + s = pd.Series(range(5), index=pd.CategoricalIndex(list('aabbc'))) + arr = DataArray(s) + assert "'a'" in repr(arr) # should not error + def test_to_masked_array(self): rs = np.random.RandomState(44) x = rs.random_sample(size=(10, 20)) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 6443a774b2d..505be42a8a7 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1133,6 +1133,57 @@ def test_swap_dims(self): with self.assertRaisesRegexp(ValueError, 'replacement dimension'): original.swap_dims({'x': 'z'}) + def test_stack(self): + ds = Dataset({'a': ('x', [0, 1]), + 'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'y': ['a', 'b']}) + + exp_index = pd.MultiIndex.from_product([[0, 1], ['a', 'b']], + names=['x', 'y']) + expected = Dataset({'a': ('z', [0, 0, 1, 1]), + 'b': ('z', [0, 1, 2, 3]), + 'z': exp_index}) + actual = ds.stack(z=['x', 'y']) + self.assertDatasetIdentical(expected, actual) + + exp_index = pd.MultiIndex.from_product([['a', 'b'], [0, 1]], + names=['y', 'x']) + expected = Dataset({'a': ('z', [0, 1, 0, 1]), + 'b': ('z', [0, 2, 1, 3]), + 'z': exp_index}) + actual = ds.stack(z=['y', 'x']) + self.assertDatasetIdentical(expected, actual) + + def test_unstack(self): 
+ index = pd.MultiIndex.from_product([[0, 1], ['a', 'b']], + names=['x', 'y']) + ds = Dataset({'b': ('z', [0, 1, 2, 3]), 'z': index}) + expected = Dataset({'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'y': ['a', 'b']}) + actual = ds.unstack('z') + self.assertDatasetIdentical(actual, expected) + + def test_unstack_errors(self): + ds = Dataset({'x': [1, 2, 3]}) + with self.assertRaisesRegexp(ValueError, 'invalid dimension'): + ds.unstack('foo') + with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): + ds.unstack('x') + + ds2 = Dataset({'x': pd.Index([(0, 1)])}) + with self.assertRaisesRegexp(ValueError, 'unnamed levels'): + ds2.unstack('x') + + def test_stack_unstack(self): + ds = Dataset({'a': ('x', [0, 1]), + 'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'y': ['a', 'b']}) + actual = ds.stack(z=['x', 'y']).unstack('z') + assert actual.broadcast_equals(ds) + + actual = ds[['b']].stack(z=['x', 'y']).unstack('z') + assert actual.identical(ds[['b']]) + def test_update(self): data = create_test_data(seed=0) expected = data.copy() diff --git a/xray/test/test_indexing.py b/xray/test/test_indexing.py index 51d5862d9f5..3b335204302 100644 --- a/xray/test/test_indexing.py +++ b/xray/test/test_indexing.py @@ -67,6 +67,17 @@ def test_orthogonal_indexer(self): with self.assertRaisesRegexp(ValueError, 'invalid subkey'): print(indexing.orthogonal_indexer((1.5 * y, 1.5 * y), x.shape)) + def test_asarray_tuplesafe(self): + res = indexing._asarray_tuplesafe(('a', 1)) + assert isinstance(res, np.ndarray) + assert res.ndim == 0 + assert res.item() == ('a', 1) + + res = indexing._asarray_tuplesafe([(0,), (1,)]) + assert res.shape == (2,) + assert res[0] == (0,) + assert res[1] == (1,) + def test_convert_label_indexer(self): # TODO: add tests that aren't just for edge cases index = pd.Index([1, 2, 3]) diff --git a/xray/test/test_variable.py b/xray/test/test_variable.py index 272816ca432..501376b2d3a 100644 --- a/xray/test/test_variable.py +++ b/xray/test/test_variable.py @@ -416,6 
+416,12 @@ def test_pandas_datetime64_with_tz(self): # pandas is new enough that it has datetime64 with timezone dtype assert v.dtype == 'object' + def test_multiindex(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + v = self.cls('x', idx) + self.assertVariableIdentical(Variable((), ('a', 0)), v[0]) + self.assertVariableIdentical(v, v[:]) + class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) @@ -754,6 +760,70 @@ def test_expand_dims(self): with self.assertRaisesRegexp(ValueError, 'must be a superset'): v.expand_dims(['z']) + def test_stack(self): + v = Variable(['x', 'y'], [[0, 1], [2, 3]], {'foo': 'bar'}) + actual = v.stack(z=('x', 'y')) + expected = Variable('z', [0, 1, 2, 3], v.attrs) + self.assertVariableIdentical(actual, expected) + + actual = v.stack(z=('x',)) + expected = Variable(('y', 'z'), v.data.T, v.attrs) + self.assertVariableIdentical(actual, expected) + + actual = v.stack(z=(),) + self.assertVariableIdentical(actual, v) + + actual = v.stack(X=('x',), Y=('y',)).transpose('X', 'Y') + expected = Variable(('X', 'Y'), v.data, v.attrs) + self.assertVariableIdentical(actual, expected) + + def test_stack_errors(self): + v = Variable(['x', 'y'], [[0, 1], [2, 3]], {'foo': 'bar'}) + + with self.assertRaisesRegexp(ValueError, 'invalid existing dim'): + v.stack(z=('x1',)) + with self.assertRaisesRegexp(ValueError, 'cannot create a new dim'): + v.stack(x=('x',)) + + def test_unstack(self): + v = Variable('z', [0, 1, 2, 3], {'foo': 'bar'}) + actual = v.unstack(z=OrderedDict([('x', 2), ('y', 2)])) + expected = Variable(('x', 'y'), [[0, 1], [2, 3]], v.attrs) + self.assertVariableIdentical(actual, expected) + + actual = v.unstack(z=OrderedDict([('x', 4), ('y', 1)])) + expected = Variable(('x', 'y'), [[0], [1], [2], [3]], v.attrs) + self.assertVariableIdentical(actual, expected) + + actual = v.unstack(z=OrderedDict([('x', 4)])) + expected = Variable('x', [0, 1, 2, 3], v.attrs) + self.assertVariableIdentical(actual, 
expected) + + def test_unstack_errors(self): + v = Variable('z', [0, 1, 2, 3]) + with self.assertRaisesRegexp(ValueError, 'invalid existing dim'): + v.unstack(foo={'x': 4}) + with self.assertRaisesRegexp(ValueError, 'cannot create a new dim'): + v.stack(z=('z',)) + with self.assertRaisesRegexp(ValueError, 'the product of the new dim'): + v.unstack(z={'x': 5}) + + def test_unstack_2d(self): + v = Variable(['x', 'y'], [[0, 1], [2, 3]]) + actual = v.unstack(y={'z': 2}) + expected = Variable(['x', 'z'], v.data) + self.assertVariableIdentical(actual, expected) + + actual = v.unstack(x={'z': 2}) + expected = Variable(['y', 'z'], v.data.T) + self.assertVariableIdentical(actual, expected) + + def test_stack_unstack_consistency(self): + v = Variable(['x', 'y'], [[0, 1], [2, 3]]) + actual = (v.stack(z=('x', 'y')) + .unstack(z=OrderedDict([('x', 2), ('y', 2)]))) + self.assertVariableIdentical(actual, v) + def test_broadcasting_math(self): x = np.random.randn(2, 3) v = Variable(['a', 'b'], x)