Skip to content

Rework DataArray internals #648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 4, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,64 @@ What's New
import xray
np.random.seed(123456)

v0.7.0 (unreleased)
-------------------

.. _v0.7.0.breaking:

Breaking changes
~~~~~~~~~~~~~~~~

- The internal data model used by :py:class:`~xray.DataArray` has been
rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`,
`this stackoverflow report`_). Internally, ``DataArray`` is now implemented
in terms of ``._variable`` and ``._coords`` attributes instead of holding
variables in a ``Dataset`` object.

This refactor ensures that if a DataArray has the
same name as one of its coordinates, the array and the coordinate no longer
share the same data.

In practice, this means that creating a DataArray with the same ``name`` as
one of its dimensions no longer automatically uses that array to label the
corresponding coordinate. You will now need to provide coordinate labels
explicitly. Here's the old behavior:

.. ipython::
:verbatim:

In [2]: xray.DataArray([4, 5, 6], dims='x', name='x')
Out[2]:
<xray.DataArray 'x' (x: 3)>
array([4, 5, 6])
Coordinates:
* x (x) int64 4 5 6

and the new behavior (compare the values of the ``x`` coordinate):

.. ipython::
:verbatim:

In [2]: xray.DataArray([4, 5, 6], dims='x', name='x')
Out[2]:
<xray.DataArray 'x' (x: 3)>
array([4, 5, 6])
Coordinates:
* x (x) int64 0 1 2

- It is no longer possible to convert an unnamed DataArray to a Dataset with
:py:meth:`xray.DataArray.to_dataset` without giving it a name: calling the
method on an unnamed array now raises ``ValueError`` unless you supply the
``name`` argument.

.. _this stackoverflow report: http://stackoverflow.com/questions/33158558/python-xray-extract-first-and-last-time-value-within-each-month-of-a-timeseries

Bug fixes
~~~~~~~~~

- Fixes for several issues found on ``DataArray`` objects with the same name
as one of their coordinates (see :ref:`v0.7.0.breaking` for more details).

v0.6.2 (unreleased)
-------------------

Expand Down
10 changes: 10 additions & 0 deletions xray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ def partial_align(*objects, **kwargs):
return tuple(obj.reindex(copy=copy, **joined_indexes) for obj in objects)


def align_variables(variables, join='outer', copy=False):
    """Return a copy of ``variables`` with its alignable values aligned.

    Every value that exposes an ``indexes`` attribute is passed, together
    with the others like it, to ``align`` (forwarding ``join`` and ``copy``).
    All remaining values are carried over untouched. The input mapping is
    not modified; the result is a new ``OrderedDict``.
    """
    keys_to_align = [key for key, value in variables.items()
                     if hasattr(value, 'indexes')]
    to_align = [variables[key] for key in keys_to_align]
    aligned_values = align(*to_align, join=join, copy=copy)

    result = OrderedDict(variables)
    # swap each alignable entry for its aligned counterpart, keeping its key
    for key, value in zip(keys_to_align, aligned_values):
        result[key] = value
    return result


def reindex_variables(variables, indexes, indexers, method=None,
tolerance=None, copy=True):
"""Conform a dictionary of aligned variables onto a new set of variables,
Expand Down
25 changes: 12 additions & 13 deletions xray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import utils
from .pycompat import iteritems, reduce, OrderedDict, basestring
from .variable import Variable
from .variable import Variable, as_variable, Coordinate


def concat(objs, dim=None, data_vars='all', coords='different',
Expand Down Expand Up @@ -120,17 +120,18 @@ def _calc_concat_dim_coord(dim):
Infer the dimension name and 1d coordinate variable (if appropriate)
for concatenating along the new dimension.
"""
from .dataarray import DataArray

if isinstance(dim, basestring):
coord = None
elif not hasattr(dim, 'dims'):
# dim is not a DataArray or Coordinate
dim_name = getattr(dim, 'name', None)
if dim_name is None:
dim_name = 'concat_dim'
coord = DataArray(dim, dims=dim_name, name=dim_name)
coord = Coordinate(dim_name, dim)
dim = dim_name
elif not hasattr(dim, 'name'):
coord = as_variable(dim).to_coord()
dim, = coord.dims
else:
coord = dim
dim, = coord.dims
Expand Down Expand Up @@ -207,6 +208,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions):
concat_over = _calc_concat_over(datasets, dim, data_vars, coords)

def insert_result_variable(k, v):
assert isinstance(v, Variable)
if k in datasets[0].coords:
result_coord_names.add(k)
result_vars[k] = v
Expand Down Expand Up @@ -267,22 +269,19 @@ def ensure_common_dims(vars):
combined = Variable.concat(vars, dim, positions)
insert_result_variable(k, combined)

# result._coord_names.update(datasets[0].coords)
result = Dataset(result_vars, attrs=result_attrs)
result = result.set_coords(result_coord_names)

if coord is not None:
# add concat dimension last to ensure that it's in the final Dataset
insert_result_variable(coord.name, coord)
# result[coord.name] = coord

result = Dataset(result_vars, attrs=result_attrs)
result = result.set_coords(result_coord_names)
result[coord.name] = coord

return result


def _dataarray_concat(arrays, dim, data_vars, coords, compat,
positions):
from .dataarray import DataArray
arrays = list(arrays)

if data_vars != 'all':
raise ValueError('data_vars is not a valid argument when '
Expand All @@ -297,11 +296,11 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
raise ValueError('array names not identical')
else:
arr = arr.rename(name)
datasets.append(arr._dataset)
datasets.append(arr._to_temp_dataset())

ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
positions)
return DataArray._new_from_dataset_no_copy(ds, name)
return arrays[0]._from_temp_dataset(ds, name)


def _auto_concat(datasets, dim=None):
Expand Down
4 changes: 2 additions & 2 deletions xray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def assign_coords(self, **kwargs):
Dataset.assign
"""
data = self.copy(deep=False)
results = data._calc_assign_results(kwargs)
results = self._calc_assign_results(kwargs)
data.coords.update(results)
return data

Expand Down Expand Up @@ -333,7 +333,7 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
RESAMPLE_DIM = '__resample_dim__'
if isinstance(dim, basestring):
dim = self[dim]
group = DataArray(dim, name=RESAMPLE_DIM)
group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM)
time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed,
label=label, base=base)
gb = self.groupby_cls(self, group, grouper=time_grouper)
Expand Down
91 changes: 57 additions & 34 deletions xray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from contextlib import contextmanager
import pandas as pd

from .pycompat import iteritems, basestring, OrderedDict
from . import formatting
from .merge import merge_dataarray_coords
from .pycompat import iteritems, basestring, OrderedDict


def _coord_merge_finalize(target, other, target_conflicts, other_conflicts,
Expand Down Expand Up @@ -37,16 +38,12 @@ def _dim_shape(var):


class AbstractCoordinates(Mapping):
@property
def _names(self):
return self._dataset._coord_names

def __getitem__(self, key):
if (key in self._names or
(isinstance(key, basestring) and
key.split('.')[0] in self._names)):
# allow indexing current coordinates or components
return self._dataset[key]
return self._data[key]
else:
raise KeyError(key)

Expand All @@ -55,7 +52,7 @@ def __setitem__(self, key, value):

def __iter__(self):
# needs to be in the same order as the dataset variables
for k in self._dataset._variables:
for k in self._variables:
if k in self._names:
yield k

Expand All @@ -65,30 +62,19 @@ def __len__(self):
def __contains__(self, key):
return key in self._names

def __delitem__(self, key):
if key in self:
del self._dataset[key]
else:
raise KeyError(key)

def __repr__(self):
return formatting.coords_repr(self)

@property
def dims(self):
return self._dataset.dims

def to_dataset(self):
"""Convert these coordinates into a new Dataset
"""
return self._dataset._copy_listed(self._names)
return self._data.dims

def to_index(self, ordered_dims=None):
"""Convert all index coordinates into a :py:class:`pandas.MultiIndex`
"""
if ordered_dims is None:
ordered_dims = self.dims
indexes = [self._dataset._variables[k].to_index() for k in ordered_dims]
indexes = [self._variables[k].to_index() for k in ordered_dims]
return pd.MultiIndex.from_product(indexes, names=list(ordered_dims))

def _merge_validate(self, other):
Expand All @@ -100,7 +86,7 @@ def _merge_validate(self, other):
promote_dims = {}
for k in self:
if k in other:
self_var = self._dataset._variables[k]
self_var = self._variables[k]
other_var = other[k].variable
if not self_var.broadcast_equals(other_var):
if k in self.dims and k in other.dims:
Expand Down Expand Up @@ -165,12 +151,31 @@ class DatasetCoordinates(AbstractCoordinates):
objects.
"""
def __init__(self, dataset):
self._dataset = dataset
self._data = dataset

@property
def _names(self):
return self._data._coord_names

@property
def _variables(self):
return self._data._variables

def to_dataset(self):
"""Convert these coordinates into a new Dataset
"""
return self._data._copy_listed(self._names)

def update(self, other):
self._dataset.update(other)
self._data.update(other)
self._names.update(other.keys())

def __delitem__(self, key):
if key in self:
del self._data[key]
else:
raise KeyError(key)


class DataArrayCoordinates(AbstractCoordinates):
"""Dictionary like container for DataArray coordinates.
Expand All @@ -180,20 +185,38 @@ class DataArrayCoordinates(AbstractCoordinates):
objects.
"""
def __init__(self, dataarray):
self._dataarray = dataarray
self._dataset = dataarray._dataset
self._data = dataarray

def update(self, other):
with self._dataarray._set_new_dataset() as ds:
ds.coords.update(other)
bad_dims = [d for d in ds.dims if d not in self.dims]
if bad_dims:
raise ValueError('DataArray does not include all coordinate '
'dimensions: %s' % bad_dims)
@property
def _names(self):
return set(self._data._coords)

@property
def dims(self):
return self._dataarray.dims
def _variables(self):
return self._data._coords

def _to_dataset(self, shallow_copy=True):
from .dataset import Dataset
coords = OrderedDict((k, v.copy(deep=False) if shallow_copy else v)
for k, v in self._data._coords.items())
dims = dict(zip(self.dims, self._data.shape))
return Dataset._construct_direct(coords, coord_names=set(self._names),
dims=dims, attrs=None)

def to_dataset(self):
return self._to_dataset()

def update(self, other):
new_vars = merge_dataarray_coords(
self._data.indexes, self._data._coords, other)

self._data._coords = new_vars

def __delitem__(self, key):
if key in self.dims:
raise ValueError('cannot delete a coordinate corresponding to a '
'DataArray dimension')
del self._data._coords[key]


class Indexes(Mapping):
Expand Down
Loading