Skip to content

Commit 96eeb13

Browse files
committed
Rework DataArray internals
Fixes GH367. Fixes GH634. The internal data model used by :py:class:`~xray.DataArray` has been rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`, `this stackoverflow report`_). Internally, ``DataArray`` is now implemented in terms of ``._variable`` and ``._coords`` attributes instead of holding variables in a ``Dataset`` object.
1 parent 76d15b2 commit 96eeb13

17 files changed

+668
-423
lines changed

doc/whats-new.rst

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,64 @@ What's New
99
import xray
1010
np.random.seed(123456)
1111
12+
v0.7.0 (unreleased)
13+
-------------------
14+
15+
.. _v0.7.0.breaking:
16+
17+
Breaking changes
18+
~~~~~~~~~~~~~~~~
19+
20+
- The internal data model used by :py:class:`~xray.DataArray` has been
21+
rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`,
22+
`this stackoverflow report`_). Internally, ``DataArray`` is now implemented
23+
in terms of ``._variable`` and ``._coords`` attributes instead of holding
24+
variables in a ``Dataset`` object.
25+
26+
This refactor ensures that if a DataArray has the
27+
same name as one of its coordinates, the array and the coordinate no longer
28+
share the same data.
29+
30+
In practice, this means that creating a DataArray with the same ``name`` as
31+
one of its dimensions no longer automatically uses that array to label the
32+
corresponding coordinate. You will now need to provide coordinate labels
33+
explicitly. Here's the old behavior:
34+
35+
.. ipython::
36+
:verbatim:
37+
38+
In [2]: xray.DataArray([4, 5, 6], dims='x', name='x')
39+
Out[2]:
40+
<xray.DataArray 'x' (x: 3)>
41+
array([4, 5, 6])
42+
Coordinates:
43+
* x (x) int64 4 5 6
44+
45+
and the new behavior (compare the values of the ``x`` coordinate):
46+
47+
.. ipython::
48+
:verbatim:
49+
50+
In [2]: xray.DataArray([4, 5, 6], dims='x', name='x')
51+
Out[2]:
52+
<xray.DataArray 'x' (x: 3)>
53+
array([4, 5, 6])
54+
Coordinates:
55+
* x (x) int64 0 1 2
56+
57+
- It is no longer possible to convert a DataArray to a Dataset with
58+
:py:meth:`xray.DataArray.to_dataset` if it is unnamed. This will now
59+
raise ``ValueError``. If the array is unnamed, you need to supply the
60+
``name`` argument.
61+
62+
.. _this stackoverflow report: http://stackoverflow.com/questions/33158558/python-xray-extract-first-and-last-time-value-within-each-month-of-a-timeseries
63+
64+
Bug fixes
65+
~~~~~~~~~
66+
67+
- Fixes for several issues found on ``DataArray`` objects with the same name
68+
as one of their coordinates (see :ref:`v0.7.0.breaking` for more details).
69+
1270
v0.6.2 (unreleased)
1371
-------------------
1472

xray/core/alignment.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,16 @@ def partial_align(*objects, **kwargs):
100100
return tuple(obj.reindex(copy=copy, **joined_indexes) for obj in objects)
101101

102102

103+
def align_variables(variables, join='outer', copy=False):
    """Return a copy of ``variables`` with its index-bearing values aligned.

    Every value that exposes an ``indexes`` attribute is passed, together
    with the others that do, to ``align`` using the given ``join`` and
    ``copy`` options. Values without an ``indexes`` attribute are carried
    over untouched.

    The result is a new ``OrderedDict`` holding the same keys in the same
    order as ``variables``; the input mapping itself is not modified.
    """
    keys_to_align = []
    objects_to_align = []
    for key, value in variables.items():
        if hasattr(value, 'indexes'):
            keys_to_align.append(key)
            objects_to_align.append(value)

    aligned_objects = align(*objects_to_align, join=join, copy=copy)

    result = OrderedDict(variables)
    # Overwriting existing keys keeps their original position in the mapping.
    for key, aligned in zip(keys_to_align, aligned_objects):
        result[key] = aligned
    return result
111+
112+
103113
def reindex_variables(variables, indexes, indexers, method=None,
104114
tolerance=None, copy=True):
105115
"""Conform a dictionary of aligned variables onto a new set of variables,

xray/core/combine.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from . import utils
66
from .pycompat import iteritems, reduce, OrderedDict, basestring
7-
from .variable import Variable
7+
from .variable import Variable, as_variable, Coordinate
88

99

1010
def concat(objs, dim=None, data_vars='all', coords='different',
@@ -120,17 +120,18 @@ def _calc_concat_dim_coord(dim):
120120
Infer the dimension name and 1d coordinate variable (if appropriate)
121121
for concatenating along the new dimension.
122122
"""
123-
from .dataarray import DataArray
124-
125123
if isinstance(dim, basestring):
126124
coord = None
127125
elif not hasattr(dim, 'dims'):
128126
# dim is not a DataArray or Coordinate
129127
dim_name = getattr(dim, 'name', None)
130128
if dim_name is None:
131129
dim_name = 'concat_dim'
132-
coord = DataArray(dim, dims=dim_name, name=dim_name)
130+
coord = Coordinate(dim_name, dim)
133131
dim = dim_name
132+
elif not hasattr(dim, 'name'):
133+
coord = as_variable(dim).to_coord()
134+
dim, = coord.dims
134135
else:
135136
coord = dim
136137
dim, = coord.dims
@@ -207,6 +208,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions):
207208
concat_over = _calc_concat_over(datasets, dim, data_vars, coords)
208209

209210
def insert_result_variable(k, v):
211+
assert isinstance(v, Variable)
210212
if k in datasets[0].coords:
211213
result_coord_names.add(k)
212214
result_vars[k] = v
@@ -267,22 +269,19 @@ def ensure_common_dims(vars):
267269
combined = Variable.concat(vars, dim, positions)
268270
insert_result_variable(k, combined)
269271

270-
# result._coord_names.update(datasets[0].coords)
272+
result = Dataset(result_vars, attrs=result_attrs)
273+
result = result.set_coords(result_coord_names)
271274

272275
if coord is not None:
273276
# add concat dimension last to ensure that it's in the final Dataset
274-
insert_result_variable(coord.name, coord)
275-
# result[coord.name] = coord
276-
277-
result = Dataset(result_vars, attrs=result_attrs)
278-
result = result.set_coords(result_coord_names)
277+
result[coord.name] = coord
279278

280279
return result
281280

282281

283282
def _dataarray_concat(arrays, dim, data_vars, coords, compat,
284283
positions):
285-
from .dataarray import DataArray
284+
arrays = list(arrays)
286285

287286
if data_vars != 'all':
288287
raise ValueError('data_vars is not a valid argument when '
@@ -297,11 +296,11 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
297296
raise ValueError('array names not identical')
298297
else:
299298
arr = arr.rename(name)
300-
datasets.append(arr._dataset)
299+
datasets.append(arr._to_temp_dataset())
301300

302301
ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
303302
positions)
304-
return DataArray._new_from_dataset_no_copy(ds, name)
303+
return arrays[0]._from_temp_dataset(ds, name)
305304

306305

307306
def _auto_concat(datasets, dim=None):

xray/core/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def assign_coords(self, **kwargs):
186186
Dataset.assign
187187
"""
188188
data = self.copy(deep=False)
189-
results = data._calc_assign_results(kwargs)
189+
results = self._calc_assign_results(kwargs)
190190
data.coords.update(results)
191191
return data
192192

@@ -333,7 +333,7 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
333333
RESAMPLE_DIM = '__resample_dim__'
334334
if isinstance(dim, basestring):
335335
dim = self[dim]
336-
group = DataArray(dim, name=RESAMPLE_DIM)
336+
group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM)
337337
time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed,
338338
label=label, base=base)
339339
gb = self.groupby_cls(self, group, grouper=time_grouper)

xray/core/coordinates.py

Lines changed: 57 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
from contextlib import contextmanager
33
import pandas as pd
44

5-
from .pycompat import iteritems, basestring, OrderedDict
65
from . import formatting
6+
from .merge import merge_dataarray_coords
7+
from .pycompat import iteritems, basestring, OrderedDict
78

89

910
def _coord_merge_finalize(target, other, target_conflicts, other_conflicts,
@@ -37,16 +38,12 @@ def _dim_shape(var):
3738

3839

3940
class AbstractCoordinates(Mapping):
40-
@property
41-
def _names(self):
42-
return self._dataset._coord_names
43-
4441
def __getitem__(self, key):
4542
if (key in self._names or
4643
(isinstance(key, basestring) and
4744
key.split('.')[0] in self._names)):
4845
# allow indexing current coordinates or components
49-
return self._dataset[key]
46+
return self._data[key]
5047
else:
5148
raise KeyError(key)
5249

@@ -55,7 +52,7 @@ def __setitem__(self, key, value):
5552

5653
def __iter__(self):
5754
# needs to be in the same order as the dataset variables
58-
for k in self._dataset._variables:
55+
for k in self._variables:
5956
if k in self._names:
6057
yield k
6158

@@ -65,30 +62,19 @@ def __len__(self):
6562
def __contains__(self, key):
6663
return key in self._names
6764

68-
def __delitem__(self, key):
69-
if key in self:
70-
del self._dataset[key]
71-
else:
72-
raise KeyError(key)
73-
7465
def __repr__(self):
7566
return formatting.coords_repr(self)
7667

7768
@property
7869
def dims(self):
79-
return self._dataset.dims
80-
81-
def to_dataset(self):
82-
"""Convert these coordinates into a new Dataset
83-
"""
84-
return self._dataset._copy_listed(self._names)
70+
return self._data.dims
8571

8672
def to_index(self, ordered_dims=None):
8773
"""Convert all index coordinates into a :py:class:`pandas.MultiIndex`
8874
"""
8975
if ordered_dims is None:
9076
ordered_dims = self.dims
91-
indexes = [self._dataset._variables[k].to_index() for k in ordered_dims]
77+
indexes = [self._variables[k].to_index() for k in ordered_dims]
9278
return pd.MultiIndex.from_product(indexes, names=list(ordered_dims))
9379

9480
def _merge_validate(self, other):
@@ -100,7 +86,7 @@ def _merge_validate(self, other):
10086
promote_dims = {}
10187
for k in self:
10288
if k in other:
103-
self_var = self._dataset._variables[k]
89+
self_var = self._variables[k]
10490
other_var = other[k].variable
10591
if not self_var.broadcast_equals(other_var):
10692
if k in self.dims and k in other.dims:
@@ -165,12 +151,31 @@ class DatasetCoordinates(AbstractCoordinates):
165151
objects.
166152
"""
167153
def __init__(self, dataset):
168-
self._dataset = dataset
154+
self._data = dataset
155+
156+
@property
157+
def _names(self):
158+
return self._data._coord_names
159+
160+
@property
161+
def _variables(self):
162+
return self._data._variables
163+
164+
def to_dataset(self):
165+
"""Convert these coordinates into a new Dataset
166+
"""
167+
return self._data._copy_listed(self._names)
169168

170169
def update(self, other):
171-
self._dataset.update(other)
170+
self._data.update(other)
172171
self._names.update(other.keys())
173172

173+
def __delitem__(self, key):
174+
if key in self:
175+
del self._data[key]
176+
else:
177+
raise KeyError(key)
178+
174179

175180
class DataArrayCoordinates(AbstractCoordinates):
176181
"""Dictionary like container for DataArray coordinates.
@@ -180,20 +185,38 @@ class DataArrayCoordinates(AbstractCoordinates):
180185
objects.
181186
"""
182187
def __init__(self, dataarray):
183-
self._dataarray = dataarray
184-
self._dataset = dataarray._dataset
188+
self._data = dataarray
185189

186-
def update(self, other):
187-
with self._dataarray._set_new_dataset() as ds:
188-
ds.coords.update(other)
189-
bad_dims = [d for d in ds.dims if d not in self.dims]
190-
if bad_dims:
191-
raise ValueError('DataArray does not include all coordinate '
192-
'dimensions: %s' % bad_dims)
190+
@property
191+
def _names(self):
192+
return set(self._data._coords)
193193

194194
@property
195-
def dims(self):
196-
return self._dataarray.dims
195+
def _variables(self):
196+
return self._data._coords
197+
198+
def _to_dataset(self, shallow_copy=True):
199+
from .dataset import Dataset
200+
coords = OrderedDict((k, v.copy(deep=False) if shallow_copy else v)
201+
for k, v in self._data._coords.items())
202+
dims = dict(zip(self.dims, self._data.shape))
203+
return Dataset._construct_direct(coords, coord_names=set(self._names),
204+
dims=dims, attrs=None)
205+
206+
def to_dataset(self):
207+
return self._to_dataset()
208+
209+
def update(self, other):
210+
new_vars = merge_dataarray_coords(
211+
self._data.indexes, self._data._coords, other)
212+
213+
self._data._coords = new_vars
214+
215+
def __delitem__(self, key):
216+
if key in self.dims:
217+
raise ValueError('cannot delete a coordinate corresponding to a '
218+
'DataArray dimension')
219+
del self._data._coords[key]
197220

198221

199222
class Indexes(Mapping):

0 commit comments

Comments
 (0)