Skip to content

Commit

Permalink
Merge pull request pydata#192 from shoyer/modify-in-place
Browse files Browse the repository at this point in the history
Enhanced support for modifying Dataset & DataArray properties in place
  • Loading branch information
shoyer committed Jul 31, 2014
2 parents 9597d9e + a7f5351 commit 20d1939
Show file tree
Hide file tree
Showing 8 changed files with 216 additions and 19 deletions.
82 changes: 78 additions & 4 deletions test/test_data_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def test_repr(self):

def test_properties(self):
self.assertDatasetIdentical(self.dv.dataset, self.ds)
self.assertEqual(self.dv.name, 'foo')
self.assertVariableEqual(self.dv.variable, self.v)
self.assertArrayEqual(self.dv.values, self.v.values)
for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', 'attrs']:
Expand All @@ -47,13 +46,39 @@ def test_properties(self):
self.assertEqual(list(self.dv.coordinates), list(self.ds.coordinates))
for k, v in iteritems(self.dv.coordinates):
self.assertArrayEqual(v, self.ds.coordinates[k])
with self.assertRaises(AttributeError):
self.dv.name = 'bar'
with self.assertRaises(AttributeError):
self.dv.dataset = self.ds
self.assertIsInstance(self.ds['x'].as_index, pd.Index)
with self.assertRaisesRegexp(ValueError, 'must be 1-dimensional'):
self.ds['foo'].as_index
with self.assertRaises(AttributeError):
self.dv.variable = self.v

def test_name(self):
# Verifies that DataArray.name can be read and reassigned in place.
arr = self.dv
self.assertEqual(arr.name, 'foo')

# After renaming, the array must still compare equal (assertDataArrayEqual)
# to a copy taken before the rename.
copied = arr.copy()
arr.name = 'bar'
self.assertEqual(arr.name, 'bar')
self.assertDataArrayEqual(copied, arr)

# Renaming an array built from Coordinate('x', ...) is expected to be
# identical to an array built directly from Coordinate('y', ...).
actual = DataArray(Coordinate('x', [3]))
actual.name = 'y'
expected = DataArray(Coordinate('y', [3]))
self.assertDataArrayIdentical(actual, expected)

def test_dimensions(self):
# Verifies that DataArray.dimensions can be read and reassigned in place.
arr = self.dv
self.assertEqual(arr.dimensions, ('x', 'y'))

arr.dimensions = ('w', 'z')
self.assertEqual(arr.dimensions, ('w', 'z'))

# 'x' here is a 1-d variable named after its own dimension; renaming the
# dimension is expected to rename the array itself as well.
x = Dataset({'x': ('x', np.arange(5))})['x']
x.dimensions = ('y',)
self.assertEqual(x.dimensions, ('y',))
self.assertEqual(x.name, 'y')

def test_encoding(self):
expected = {'foo': 'bar'}
Expand Down Expand Up @@ -166,10 +191,13 @@ def test_constructor_from_self_described(self):
expected = DataArray([data], expected.coordinates, ['dim_0', 'x', 'y'])
self.assertDataArrayIdentical(expected, actual)

expected = DataArray(['a', 'b'], name='foo')
expected = Dataset({'foo': ('foo', ['a', 'b'])})['foo']
actual = DataArray(pd.Index(['a', 'b'], name='foo'))
self.assertDataArrayIdentical(expected, actual)

actual = DataArray(Coordinate('foo', ['a', 'b']))
self.assertDataArrayIdentical(expected, actual)

def test_equals_and_identical(self):
da2 = self.dv.copy()
self.assertTrue(self.dv.equals(da2))
Expand Down Expand Up @@ -275,6 +303,52 @@ def test_coordinates(self):
actual = repr(da.coordinates)
self.assertEquals(expected, actual)

def test_coordinates_modify(self):
# Exercises in-place modification of DataArray coordinates, both one
# coordinate at a time and by replacing the whole `coordinates` property.
da = DataArray(np.zeros((2, 3)), dimensions=['x', 'y'])

# A coordinate may be addressed by dimension name or by integer position;
# with dimensions ['x', 'y'], the keys 'x', 0 and -2 all select 'x'.
for k, v in [('x', ['a', 'b']), (0, ['c', 'd']), (-2, ['e', 'f'])]:
da.coordinates[k] = v
self.assertArrayEqual(da.coordinates[k], v)

# Assign a plain list of coordinate values; afterwards the array is
# expected to reference a different dataset object than before.
actual = da.copy()
orig_dataset = actual.dataset
actual.coordinates = [[5, 6], [7, 8, 9]]
expected = DataArray(np.zeros((2, 3)), coordinates=[[5, 6], [7, 8, 9]],
dimensions=['x', 'y'])
self.assertDataArrayIdentical(actual, expected)
self.assertIsNot(actual.dataset, orig_dataset)

# Assign another array's coordinates object (same dimension names).
actual = da.copy()
actual.coordinates = expected.coordinates
self.assertDataArrayIdentical(actual, expected)

# Assign a coordinates object whose dimension names differ ('foo', 'bar');
# the result is expected to be identical to an array built with those names.
actual = da.copy()
expected = DataArray(np.zeros((2, 3)), coordinates=[[5, 6], [7, 8, 9]],
dimensions=['foo', 'bar'])
actual.coordinates = expected.coordinates
self.assertDataArrayIdentical(actual, expected)

# Error cases: wrong length for an existing coordinate, an unknown
# coordinate name, and a coordinates object with a mismatched size.
with self.assertRaisesRegexp(ValueError, 'coordinate has size'):
da.coordinates['x'] = ['a']

with self.assertRaises(IndexError):
da.coordinates['foobar'] = np.arange(4)

with self.assertRaisesRegexp(ValueError, 'coordinate has size'):
da.coordinates = da.isel(y=slice(2)).coordinates

# modify the coordinates on a coordinate itself
x = DataArray(Coordinate('x', [10.0, 20.0, 30.0]))

actual = x.copy()
actual.coordinates = [[0, 1, 2]]
expected = DataArray(Coordinate('x', range(3)))
self.assertDataArrayIdentical(actual, expected)

# Assigning coordinates taken from an 'x' coordinate onto a 'y' coordinate
# is expected to yield an array identical to the 'x' one.
actual = DataArray(Coordinate('y', [-10, -20, -30]))
actual.coordinates = expected.coordinates
self.assertDataArrayIdentical(actual, expected)

def test_reindex(self):
foo = self.dv
bar = self.dv[:2, :2]
Expand Down
25 changes: 25 additions & 0 deletions test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,22 @@ def test_coordinates_properties(self):
actual = repr(data.coordinates)
self.assertEquals(expected, actual)

def test_coordinates_modify(self):
# Exercises item assignment on Dataset.coordinates.
data = Dataset({'x': ('x', [-1, -2]),
'y': ('y', [0, 1, 2]),
'foo': (['x', 'y'], np.random.randn(2, 3))})

# Replace an existing coordinate with values of the same length.
actual = data.copy(deep=True)
actual.coordinates['x'] = ['a', 'b']
self.assertArrayEqual(actual['x'], ['a', 'b'])

# Assigning under a name that is not yet in the dataset ('z') adds it.
actual = data.copy(deep=True)
actual.coordinates['z'] = ['a', 'b']
self.assertArrayEqual(actual['z'], ['a', 'b'])

# Replacing an existing coordinate with a different length must fail.
with self.assertRaisesRegexp(ValueError, 'coordinate has size'):
data.coordinates['x'] = [-1]

def test_equals_and_identical(self):
data = create_test_data(seed=42)
self.assertTrue(data.equals(data))
Expand Down Expand Up @@ -429,6 +445,15 @@ def test_rename(self):
with self.assertRaises(UnexpectedDataAccess):
renamed['renamed_var1'].values

def test_rename_inplace(self):
    """Check that ``Dataset.rename(..., inplace=True)`` mutates the dataset.

    The in-place result must be identical to what the non-in-place call
    returns, must no longer equal a copy taken before the rename, and must
    report the renamed dimension.
    """
    data = Dataset({'z': ('x', [2, 3, 4])})
    copied = data.copy()
    renamed = data.rename({'x': 'y'})
    data.rename({'x': 'y'}, inplace=True)
    self.assertDatasetIdentical(data, renamed)
    self.assertFalse(data.equals(copied))
    # assertEqual rather than the deprecated assertEquals alias, matching
    # the other assertions in this test suite.
    self.assertEqual(data.dimensions, {'y': 3})

def test_update(self):
data = create_test_data(seed=0)
var2 = Variable('dim1', np.arange(100))
Expand Down
8 changes: 7 additions & 1 deletion test/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,6 @@ def test_data(self):
self.assertEqual(float, x.dtype)
self.assertArrayEqual(np.arange(3), x)
self.assertEqual(float, x.values.dtype)
self.assertEqual('x', x.name)
# after inspecting x.values, the Coordinate value will be saved as an Index
self.assertIsInstance(x._data, PandasIndexAdapter)
with self.assertRaisesRegexp(TypeError, 'cannot be modified'):
Expand All @@ -603,6 +602,13 @@ def test_avoid_index_dtype_inference(self):
self.assertEqual(t.dtype, object)
self.assertEqual(t[:2].dtype, object)

def test_name(self):
# A Coordinate exposes its name, but assigning to it must raise
# AttributeError.
coord = Coordinate('x', [10.0])
self.assertEqual(coord.name, 'x')

with self.assertRaises(AttributeError):
coord.name = 'y'


class TestAsCompatibleData(TestCase):
def test_unchanged_types(self):
Expand Down
14 changes: 14 additions & 0 deletions xray/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,20 @@ def __repr__(self):
return '\n'.join(_wrap_indent(repr(v.as_index), '%s: ' % k)
for k, v in self.items())

@staticmethod
def _convert_to_coord(key, value, expected_size=None):
    """Turn ``value`` into a coordinate, optionally checking its size.

    Parameters
    ----------
    key : name handed to ``Coordinate`` when ``value`` has to be wrapped.
    value : an ``AbstractArray`` instance, or anything ``Coordinate``
        accepts as data.
    expected_size : int, optional
        When given, the resulting coordinate's ``size`` must equal it.

    Returns
    -------
    The result of ``as_variable(...).to_coord()``.

    Raises
    ------
    ValueError
        If ``expected_size`` is given and does not match.
    """
    # Imported here rather than at module level, as in the original code.
    from .variable import Coordinate, as_variable

    if isinstance(value, AbstractArray):
        wrapped = value
    else:
        wrapped = Coordinate(key, value)
    coord = as_variable(wrapped).to_coord()

    if expected_size is None or coord.size == expected_size:
        return coord
    raise ValueError('new coordinate has size %s but the existing '
                     'coordinate has size %s'
                     % (coord.size, expected_size))


def _summarize_attributes(data):
if data.attrs:
Expand Down
80 changes: 70 additions & 10 deletions xray/data_array.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import contextlib
import functools
import operator
import warnings
Expand Down Expand Up @@ -80,6 +81,12 @@ def __setitem__(self, key, value):
self.data_array[self._remap_key(key)] = value


def _assert_coordinates_same_size(orig, new):
    """Raise ``ValueError`` unless ``new.size`` equals ``orig.size``.

    Both arguments only need to expose a ``size`` attribute. Returns
    ``None`` when the sizes agree.
    """
    if new.size == orig.size:
        return
    raise ValueError('new coordinate has size %s but the existing '
                     'coordinate has size %s' % (new.size, orig.size))


class DataArrayCoordinates(AbstractCoordinates):
"""Dictionary like container for DataArray coordinates.
Expand All @@ -96,6 +103,17 @@ def __getitem__(self, key):
else:
raise KeyError(repr(key))

def __setitem__(self, key, value):
    """Replace one coordinate of the underlying DataArray.

    Parameters
    ----------
    key : str or int
        Coordinate name, or an integer position into the array's
        dimensions (negative positions index from the end).
    value : new coordinate values; must have the same size as the
        coordinate being replaced.

    Raises
    ------
    IndexError
        If ``key`` does not name an existing coordinate (or an integer
        position is out of range).
    ValueError
        If the new coordinate has a different size (raised by
        ``_convert_to_coord``).
    """
    if isinstance(key, (int, np.integer)):
        key = self._data.dimensions[key]

    if key not in self:
        # Interpolate the offending key; the message previously contained
        # a literal, unformatted '%s'.
        raise IndexError('%s is not a coordinate' % (key,))

    coord = self._convert_to_coord(key, value, self[key].size)
    # Write into the dataset yielded by _set_new_dataset() instead of the
    # one currently attached to the array.
    with self._data._set_new_dataset() as ds:
        ds._variables[key] = coord


class DataArray(AbstractArray):
"""N-dimensional array with labeled coordinates and dimensions.
Expand Down Expand Up @@ -178,6 +196,8 @@ def __init__(self, data=None, coordinates=None, dimensions=None, name=None,
coordinates = [data.index]
elif isinstance(data, pd.DataFrame):
coordinates = [data.index, data.columns]
elif isinstance(data, (pd.Index, variable.Coordinate)):
coordinates = [data]
elif isinstance(data, pd.Panel):
coordinates = [data.items, data.major_axis, data.minor_axis]
if dimensions is None:
Expand All @@ -197,12 +217,10 @@ def __init__(self, data=None, coordinates=None, dimensions=None, name=None,
dimensions, data, attributes, encoding)
dataset = xray.Dataset(variables)
else:
# move this back to an alternate constructor?
if name not in dataset and name not in dataset.virtual_variables:
raise ValueError('name %r must be a variable in dataset %s' %
(name, dataset))
# make a shallow copy of the dataset so we can safely modify the
# array in-place?
# dataset = dataset.copy(deep=False)

self._dataset = dataset
self._name = name
Expand All @@ -220,19 +238,25 @@ def name(self):
"""
return self._name

@contextlib.contextmanager
def _set_new_dataset(self):
    """Yield a shallow copy of ``self.dataset`` and adopt it on success.

    The copy is stored in ``self._dataset`` only after the ``with`` body
    finishes without raising; if the body raises, ``self._dataset`` is
    left untouched.
    """
    replacement = self.dataset.copy(deep=False)
    yield replacement
    # Reached only when the with-body completed normally.
    self._dataset = replacement

@name.setter
def name(self, value):
raise AttributeError('cannot modify the name of a %s inplace; use the '
"'rename' method instead" % type(self).__name__)
with self._set_new_dataset() as ds:
ds.rename({self.name: value}, inplace=True)
self._name = value

@property
def variable(self):
return self.dataset.variables[self.name]

@variable.setter
def variable(self, value):
self.dataset[self.name] = value

@property
def dtype(self):
return self.variable.dtype
Expand Down Expand Up @@ -274,6 +298,17 @@ def as_index(self):
def dimensions(self):
return self.variable.dimensions

@dimensions.setter
def dimensions(self, value):
    """Rename this array's dimensions in place.

    Parameters
    ----------
    value : sequence of str
        New dimension names, one per existing dimension, in order.

    Raises
    ------
    ValueError
        If ``len(value)`` differs from ``self.ndim``.
    """
    # Validate before copying the dataset (as the ``coordinates`` setter
    # does), so an invalid argument does not trigger a needless copy.
    if len(value) != self.ndim:
        raise ValueError('%s dimensions supplied but data has ndim=%s'
                         % (len(value), self.ndim))
    name_map = dict(zip(self.dimensions, value))
    with self._set_new_dataset() as ds:
        ds.rename(name_map, inplace=True)
    # If the array is named after one of the renamed dimensions, its own
    # name has to follow the rename.
    if self.name in name_map:
        self._name = name_map[self.name]

def _key_to_indexers(self, key):
return OrderedDict(
zip(self.dimensions, indexing.expanded_indexer(key, self.ndim)))
Expand Down Expand Up @@ -350,6 +385,31 @@ def coordinates(self):
"""
return DataArrayCoordinates(self)

@coordinates.setter
def coordinates(self, value):
# Replace all of this array's coordinates at once. `value` must contain
# exactly one entry per dimension (checked against self.ndim below).
if not len(value) == self.ndim:
raise ValueError('%s coordinates supplied but data has ndim=%s'
% (len(value), self.ndim))
# All changes are made on the dataset yielded by _set_new_dataset().
with self._set_new_dataset() as ds:
# TODO: allow setting to dict-like objects other than
# DataArrayCoordinates?
if isinstance(value, DataArrayCoordinates):
# yes, this is regrettably complex and probably slow
# Pair existing dimension names positionally with the keys of the
# supplied coordinates and rename the dataset accordingly.
name_map = dict(zip(self.dimensions, value.keys()))
ds.rename(name_map, inplace=True)
# If this array is named after a renamed dimension, follow the rename.
name = name_map.get(self.name, self.name)
dimensions = ds[name].dimensions
value = value.values()
else:
name = self.name
dimensions = self.dimensions

# Each new coordinate must match the size of the one it replaces;
# _convert_to_coord raises ValueError otherwise.
for k, v in zip(dimensions, value):
coord = DataArrayCoordinates._convert_to_coord(
k, v, expected_size=ds.coordinates[k].size)
ds[k] = coord
self._name = name

def load_data(self):
"""Manually trigger loading of this array's data from disk or a
remote source into memory and return this array.
Expand Down Expand Up @@ -836,7 +896,7 @@ def _inplace_binary_op(f):
def func(self, other):
self._check_coords_compat(other)
other_array = getattr(other, 'variable', other)
self.variable = f(self.variable, other_array)
f(self.variable, other_array)
if hasattr(other, 'coordinates'):
self.dataset.merge(other.coordinates, inplace=True)
return self
Expand Down
18 changes: 16 additions & 2 deletions xray/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ def __getitem__(self, key):
else:
raise KeyError(repr(key))

def __setitem__(self, key, value):
    """Set coordinate ``key`` on the underlying dataset.

    When ``key`` already exists in this mapping, the new coordinate is
    required to have the same size (enforced by ``_convert_to_coord``);
    otherwise no size constraint is passed.
    """
    if key in self:
        expected_size = self[key].size
    else:
        expected_size = None
    coord = self._convert_to_coord(key, value, expected_size)
    self._data[key] = coord


def as_dataset(obj):
"""Cast the given object to a Dataset.
Expand Down Expand Up @@ -852,14 +856,17 @@ def get_fill_value_and_dtype(dtype):
variables[name] = new_var
return type(self)(variables, self.attrs)

def rename(self, name_dict):
def rename(self, name_dict, inplace=False):
"""Returns a new object with renamed variables and dimensions.
Parameters
----------
name_dict : dict-like
Dictionary whose keys are current variable or dimension names and
whose values are new names.
inplace : bool, optional
If True, rename variables and dimensions in-place. Otherwise,
return a new dataset object.
Returns
-------
Expand All @@ -877,7 +884,14 @@ def rename(self, name_dict):
var = v.copy(deep=False)
var.dimensions = dims
variables[name] = var
return type(self)(variables, self.attrs)

if inplace:
self._dimensions = _calculate_dimensions(variables)
self._variables = variables
obj = self
else:
obj = type(self)(variables, self.attrs)
return obj

def update(self, other, inplace=True):
"""Update this dataset's variables and attributes with those from
Expand Down
Loading

0 comments on commit 20d1939

Please sign in to comment.