Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add 'no_conflicts' as compat option for merging non-conflicting data #996

Merged
merged 18 commits into from
Sep 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/combining.rst
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,21 @@ numpy):

Note that ``NaN`` does not compare equal to ``NaN`` in element-wise comparison;
you may need to deal with missing values explicitly.

Merging with 'no_conflicts'
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``compat`` argument ``'no_conflicts'`` is only available when
combining xarray objects with ``merge``. In addition to the above comparison
methods, it allows the merging of xarray objects with locations where *either*
has ``NaN`` values. This can be used to combine data with overlapping
coordinates as long as any non-missing values agree or are disjoint:

.. ipython:: python
ds1 = xr.Dataset({'a': ('x', [10, 20, 30, np.nan])}, {'x': [1, 2, 3, 4]})
ds2 = xr.Dataset({'a': ('x', [np.nan, 30, 40, 50])}, {'x': [2, 3, 4, 5]})
xr.merge([ds1, ds2], compat='no_conflicts')

Note that due to the underlying representation of missing values as floating
point numbers (``NaN``), variable data type is not always preserved when merging
in this manner.
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ By `Robin Wilson <https://github.com/robintw>`_.
(see :ref:`multi-level indexing`).
By `Benoit Bovy <https://github.com/benbovy>`_.

- Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the
combination of xarray objects with disjoint (:issue:`742`) or
overlapping (:issue:`835`) coordinates as long as any present data agrees.
By `Johnnie Gray <https://github.com/jcmgray>`_.

Bug fixes
~~~~~~~~~

Expand Down
8 changes: 6 additions & 2 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,7 +1453,8 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
overwrite_vars : str or sequence, optional
If provided, update variables of these name(s) without checking for
conflicts in this dataset.
compat : {'broadcast_equals', 'equals', 'identical'}, optional
compat : {'broadcast_equals', 'equals', 'identical',
'no_conflicts'}, optional
String indicating how to compare variables of the same name for
potential conflicts:

Expand All @@ -1462,6 +1463,9 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
- 'equals': all values and dimensions must be the same.
- 'identical': all values, dimensions and attributes must be the
same.
- 'no_conflicts': only values which are not null in both datasets
must be equal. The returned dataset then contains the combination
of all non-null values.
join : {'outer', 'inner', 'left', 'right'}, optional
Method for joining ``self`` and ``other`` along shared dimensions:

Expand All @@ -1477,7 +1481,7 @@ def merge(self, other, inplace=False, overwrite_vars=set(),

Raises
------
ValueError
MergeError
If any variables conflict (see ``compat``).
"""
variables, coord_names, dims = dataset_merge_method(
Expand Down
32 changes: 23 additions & 9 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
_VALID_COMPAT = Frozen({'identical': 0,
'equals': 1,
'broadcast_equals': 2,
'minimal': 3})
'minimal': 3,
'no_conflicts': 4})


def broadcast_dimension_size(variables):
Expand Down Expand Up @@ -48,7 +49,8 @@ def unique_variable(name, variables, compat='broadcast_equals'):
variables : list of xarray.Variable
List of Variable objects, all of which go by the same name in different
inputs.
compat : {'identical', 'equals', 'broadcast_equals'}, optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Type of equality check to use.

Returns
Expand All @@ -61,19 +63,27 @@ def unique_variable(name, variables, compat='broadcast_equals'):
"""
out = variables[0]
if len(variables) > 1:
combine_method = None

if compat == 'minimal':
compat = 'broadcast_equals'

if compat == 'broadcast_equals':
dim_lengths = broadcast_dimension_size(variables)
out = out.expand_dims(dim_lengths)

if compat == 'no_conflicts':
combine_method = 'fillna'

for var in variables[1:]:
if not getattr(out, compat)(var):
raise MergeError('conflicting values for variable %r on '
'objects to be combined:\n'
'first value: %r\nsecond value: %r'
% (name, out, var))
if combine_method:
out = getattr(out, combine_method)(var)

return out


Expand Down Expand Up @@ -110,8 +120,9 @@ def merge_variables(
priority_vars : mapping with Variable values, optional
If provided, variables are always taken from this dict in preference to
the input variable dictionaries, without checking for conflicts.
compat : {'identical', 'equals', 'broadcast_equals', 'minimal'}, optional
Type of equality check to use when checking for conflicts.
compat : {'identical', 'equals', 'broadcast_equals',
'minimal', 'no_conflicts'}, optional
Type of equality check to use when checking for conflicts.

Returns
-------
Expand Down Expand Up @@ -342,7 +353,8 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):
Dictionaries in which to find the priority variables.
priority_arg : int or None
Integer object whose variable should take priority.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.

Returns
Expand Down Expand Up @@ -395,9 +407,10 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
----------
objs : list of mappings
All values must be convertible to labeled arrays.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.
join : 'outer', 'inner', 'left' or 'right', optional
join : {'outer', 'inner', 'left', 'right'}, optional
How to combine objects with different indexes.
priority_arg : integer, optional
Optional argument in `objs` that takes precedence over the others.
Expand Down Expand Up @@ -461,9 +474,10 @@ def merge(objects, compat='broadcast_equals', join='outer'):
objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]]
Merge together all variables from these objects. If any of them are
DataArray objects, they must have a name.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.
join : 'outer', 'inner', 'left' or 'right', optional
join : {'outer', 'inner', 'left', 'right'}, optional
How to combine objects with different indexes.

Returns
Expand Down
33 changes: 25 additions & 8 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,20 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None):

around = _dask_or_eager_func('around')
isclose = _dask_or_eager_func('isclose')
isnull = _dask_or_eager_func('isnull', pd)
notnull = _dask_or_eager_func('notnull', pd)
_isnull = _dask_or_eager_func('isnull', pd)


def isnull(data):
    """Return a boolean mask marking the null entries of ``data``.

    The work is delegated to ``_isnull``. See GH837, GH861: the pandas null
    check raises ``TypeError`` when run on a numpy structured array, so for
    such input all data is assumed to be present and an all-False mask with
    the shape of ``data`` is returned instead.
    """
    try:
        mask = _isnull(data)
    except TypeError:
        # Structured arrays cannot be null-checked; treat nothing as missing.
        mask = np.zeros(data.shape, dtype=bool)
    return mask


transpose = _dask_or_eager_func('transpose')
where = _dask_or_eager_func('where', n_array_args=3)
Expand Down Expand Up @@ -125,17 +137,22 @@ def array_equiv(arr1, arr2):
return False

flag_array = (arr1 == arr2)
flag_array |= (isnull(arr1) & isnull(arr2))

# GH837, GH861
# isnull fcn from pandas will throw TypeError when run on numpy structured array
# therefore for dims that are np structured arrays we skip testing for nan
return bool(flag_array.all())

try:

flag_array |= (isnull(arr1) & isnull(arr2))
def array_notnull_equiv(arr1, arr2):
"""Like np.array_equal, but also allows values to be NaN in either or both
arrays.

Returns False when the two arrays differ in shape. Otherwise returns True
only if every position is either equal in both arrays or null (as reported
by ``isnull``) in at least one of them.
"""
arr1, arr2 = as_like_arrays(arr1, arr2)
if arr1.shape != arr2.shape:
return False

except TypeError:
pass
flag_array = (arr1 == arr2)
flag_array |= isnull(arr1)
flag_array |= isnull(arr2)

return bool(flag_array.all())

Expand Down
18 changes: 18 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,24 @@ def identical(self, other):
except (TypeError, AttributeError):
return False

def _data_no_conflicts(self, other):
    """Report whether the data of ``self`` and ``other`` do not conflict.

    Returns True immediately when both wrap the very same ``_data`` object;
    otherwise returns the result of ``ops.array_notnull_equiv`` applied to
    the ``data`` of both.
    """
    if self._data is other._data:
        # Identical underlying object: no element-wise comparison needed.
        return True
    return ops.array_notnull_equiv(self.data, other.data)

def no_conflicts(self, other):
    """True if the intersection of two Variable's non-null data is
    equal; otherwise false.

    Locations where either operand, or both, hold NaN do not count as a
    conflict, so two variables can pass this check without being equal.
    ``other`` is unwrapped through its ``variable`` attribute when it has
    one. A ``TypeError`` or ``AttributeError`` raised during the comparison
    is reported as False.
    """
    other = getattr(other, 'variable', other)
    try:
        dims_match = self.dims == other.dims
        if not dims_match:
            # Differing dimensions can never be conflict-free.
            return dims_match
        return self._data_no_conflicts(other)
    except (TypeError, AttributeError):
        return False

@property
def real(self):
return type(self)(self.dims, self.data.real, self._attrs)
Expand Down
76 changes: 74 additions & 2 deletions xarray/test/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,51 @@ def test_merge_error(self):
with self.assertRaises(xr.MergeError):
xr.merge([ds, ds + 1])

def test_merge_no_conflicts_single_var(self):
    """Exercise ``xr.merge(..., compat='no_conflicts')`` on one variable.

    Covers both argument orders with the default join, the 'left', 'right'
    and 'inner' joins, and two inputs that must raise ``xr.MergeError``.
    """
    ds1 = xr.Dataset({'a': ('x', [1, 2])})
    ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})

    # Default join: the result must not depend on argument order.
    expected = xr.Dataset({'a': ('x', [1, 2, 3])})
    assert expected.identical(xr.merge([ds1, ds2],
                                       compat='no_conflicts'))
    assert expected.identical(xr.merge([ds2, ds1],
                                       compat='no_conflicts'))

    # 'left' / 'right' joins are expected to reproduce the respective input.
    assert ds1.identical(xr.merge([ds1, ds2],
                                  compat='no_conflicts',
                                  join='left'))
    assert ds2.identical(xr.merge([ds1, ds2],
                                  compat='no_conflicts',
                                  join='right'))

    expected = xr.Dataset({'a': ('x', [2]), 'x': [1]})
    assert expected.identical(xr.merge([ds1, ds2],
                                       compat='no_conflicts',
                                       join='inner'))

    # The fixtures are built outside ``assertRaises`` so that only the merge
    # call itself can satisfy the expected MergeError.
    ds3 = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    with self.assertRaises(xr.MergeError):
        xr.merge([ds1, ds3], compat='no_conflicts')

    # Same variable name, but defined along 'y' instead of 'x'.
    ds3 = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
    with self.assertRaises(xr.MergeError):
        xr.merge([ds1, ds3], compat='no_conflicts')

def test_merge_no_conflicts_multi_var(self):
    """Exercise ``xr.merge(..., compat='no_conflicts')`` on several variables.

    First merges two distinct variables taken from deep copies of the same
    test dataset, then blanks complementary regions of ``var1`` and ``var2``
    in the two copies and checks that merging them equals the original data.
    """
    reference = create_test_data()
    left = reference.copy(deep=True)
    right = reference.copy(deep=True)

    merged = xr.merge([left.var1, right.var2], compat='no_conflicts')
    expected = reference[['var1', 'var2']]
    assert expected.identical(merged)

    # Punch complementary NaN holes into each copy.
    left['var1'][:, :5] = np.nan
    right['var1'][:, 5:] = np.nan
    left['var2'][:4, :] = np.nan
    right['var2'][4:, :] = np.nan
    del right['var3']

    merged = xr.merge([left, right], compat='no_conflicts')
    assert reference.equals(merged)


class TestMergeMethod(TestCase):

Expand Down Expand Up @@ -111,7 +156,8 @@ def test_merge_broadcast_equals(self):
def test_merge_compat(self):
ds1 = xr.Dataset({'x': 0})
ds2 = xr.Dataset({'x': 1})
for compat in ['broadcast_equals', 'equals', 'identical']:
for compat in ['broadcast_equals', 'equals', 'identical',
'no_conflicts']:
with self.assertRaises(xr.MergeError):
ds1.merge(ds2, compat=compat)

Expand All @@ -132,7 +178,7 @@ def test_merge_auto_align(self):
ds1 = xr.Dataset({'a': ('x', [1, 2])})
ds2 = xr.Dataset({'b': ('x', [3, 4]), 'x': [1, 2]})
expected = xr.Dataset({'a': ('x', [1, 2, np.nan]),
'b': ('x', [np.nan, 3, 4])})
'b': ('x', [np.nan, 3, 4])})
assert expected.identical(ds1.merge(ds2))
assert expected.identical(ds2.merge(ds1))

Expand All @@ -143,3 +189,29 @@ def test_merge_auto_align(self):
expected = expected.isel(x=slice(1, 2))
assert expected.identical(ds1.merge(ds2, join='inner'))
assert expected.identical(ds2.merge(ds1, join='inner'))

def test_merge_no_conflicts(self):
    """Exercise ``Dataset.merge(..., compat='no_conflicts')``.

    Covers both call directions with the default join, the 'left', 'right'
    and 'inner' joins, and two inputs that must raise ``xr.MergeError``.
    """
    ds1 = xr.Dataset({'a': ('x', [1, 2])})
    ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})
    expected = xr.Dataset({'a': ('x', [1, 2, 3])})

    # Default join: the result must not depend on which side calls merge.
    assert expected.identical(ds1.merge(ds2, compat='no_conflicts'))
    assert expected.identical(ds2.merge(ds1, compat='no_conflicts'))

    assert ds1.identical(ds1.merge(ds2, compat='no_conflicts',
                                   join='left'))

    assert ds2.identical(ds1.merge(ds2, compat='no_conflicts',
                                   join='right'))

    expected2 = xr.Dataset({'a': ('x', [2]), 'x': [1]})
    assert expected2.identical(ds1.merge(ds2, compat='no_conflicts',
                                         join='inner'))

    # The fixtures are built outside ``assertRaises`` so that only the merge
    # call itself can satisfy the expected MergeError.
    ds3 = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    with self.assertRaises(xr.MergeError):
        ds1.merge(ds3, compat='no_conflicts')

    # Same variable name, but defined along 'y' instead of 'x'.
    ds3 = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
    with self.assertRaises(xr.MergeError):
        ds1.merge(ds3, compat='no_conflicts')
35 changes: 33 additions & 2 deletions xarray/test/test_ops.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pytest import mark
import numpy as np
from numpy import array, nan
from xarray.core import ops
from xarray.core.ops import (
first, last, count, mean
first, last, count, mean, array_notnull_equiv,
)

from . import TestCase
Expand Down Expand Up @@ -74,3 +74,34 @@ def test_count(self):

def test_all_nan_arrays(self):
assert np.isnan(mean([np.nan, np.nan]))


class TestArrayNotNullEquiv():
    """Tests for ``array_notnull_equiv``."""

    @mark.parametrize("arr1, arr2", [
        (np.array([1, 2, 3]), np.array([1, 2, 3])),
        (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])),
        (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])),
    ])
    def test_equal(self, arr1, arr2):
        """Pairs that agree wherever neither side is NaN are accepted."""
        outcome = array_notnull_equiv(arr1, arr2)
        assert outcome

    def test_some_not_equal(self):
        """A mismatch at a position where both sides are non-null is rejected."""
        left = np.array([1, 2, 4])
        right = np.array([1, np.nan, 3])
        outcome = array_notnull_equiv(left, right)
        assert not outcome

    def test_wrong_shape(self):
        """Arrays of different shape are rejected."""
        row = np.array([[1, np.nan, np.nan, 4]])
        square = np.array([[1, 2], [np.nan, 4]])
        outcome = array_notnull_equiv(row, square)
        assert not outcome

    @mark.parametrize("val1, val2, val3, null", [
        (1, 2, 3, None),
        (1., 2., 3., np.nan),
        (1., 2., 3., None),
        ('foo', 'bar', 'baz', None),
    ])
    def test_types(self, val1, val2, val3, null):
        """Several value types and null markers are all accepted."""
        first = np.array([val1, null, val3, null])
        second = np.array([val1, val2, null, null])
        assert array_notnull_equiv(first, second)
Loading