Skip to content

Commit

Permalink
Refactor update coordinates to better handle multi-coordinate indexes (
Browse files Browse the repository at this point in the history
…pydata#8094)

* generic warning implicitly wrap a pd.MultiIndex

* refactor update_coords (assign)

Fix more cases with multi-coordinate indexes:

- do not try to align existing indexed coordinates with the new
coordinates that will fully replace them

- raise early if the new coordinates would corrupt the existing indexed
coordinates

- isolate PandasMultiIndex special cases so that it will be easier to
drop support for it later (and warn now about deprecation)

* fix alignment of updated coordinates

when DataArray objects are passed as new coordinate objects

* refactor Dataset.assign

Need to update (replace) coordinates and data variables separately to
ensure it goes through all (indexed) coordinate update checks.

* fix and update tests

* nit

* fix mypy?

* update what's new

* fix import error

* fix performance regression

* nit

* use warning util func and improve messages
  • Loading branch information
benbovy authored Aug 29, 2023
1 parent 42d42ba commit 1fedfd8
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 102 deletions.
11 changes: 9 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ What's New
np.random.seed(123456)
.. _whats-new.2023.08.1:

v2023.08.1 (unreleased)
Expand All @@ -31,10 +30,19 @@ Breaking changes
Deprecations
~~~~~~~~~~~~

- Deprecate passing a :py:class:`pandas.MultiIndex` object directly to the
:py:class:`Dataset` and :py:class:`DataArray` constructors as well as to
:py:meth:`Dataset.assign` and :py:meth:`Dataset.assign_coords`.
A new Xarray :py:class:`Coordinates` object has to be created first using
:py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`).
By `Benoît Bovy <https://github.com/benbovy>`_.

Bug fixes
~~~~~~~~~

- Improved handling of multi-coordinate indexes when updating coordinates, including bug fixes
(and improved warnings for deprecated features) for pandas multi-indexes (:pull:`8094`).
By `Benoît Bovy <https://github.com/benbovy>`_.

Documentation
~~~~~~~~~~~~~
Expand Down Expand Up @@ -120,7 +128,6 @@ Breaking changes
numbagg 0.1 0.2.1
===================== ========= ========


Documentation
~~~~~~~~~~~~~

Expand Down
193 changes: 115 additions & 78 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import warnings
from collections.abc import Hashable, Iterator, Mapping, Sequence
from contextlib import contextmanager
from typing import (
Expand All @@ -24,7 +23,7 @@
)
from xarray.core.merge import merge_coordinates_without_align, merge_coords
from xarray.core.types import Self, T_DataArray
from xarray.core.utils import Frozen, ReprObject
from xarray.core.utils import Frozen, ReprObject, emit_user_level_warning
from xarray.core.variable import Variable, as_variable, calculate_dimensions

if TYPE_CHECKING:
Expand Down Expand Up @@ -83,7 +82,7 @@ def variables(self):
def _update_coords(self, coords, indexes):
raise NotImplementedError()

def _maybe_drop_multiindex_coords(self, coords):
def _drop_coords(self, coord_names):
raise NotImplementedError()

def __iter__(self) -> Iterator[Hashable]:
Expand Down Expand Up @@ -379,9 +378,9 @@ def _update_coords(
# redirect to DatasetCoordinates._update_coords
self._data.coords._update_coords(coords, indexes)

def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
# redirect to DatasetCoordinates._maybe_drop_multiindex_coords
self._data.coords._maybe_drop_multiindex_coords(coords)
def _drop_coords(self, coord_names):
# redirect to DatasetCoordinates._drop_coords
self._data.coords._drop_coords(coord_names)

def _merge_raw(self, other, reflexive):
"""For use with binary arithmetic."""
Expand Down Expand Up @@ -454,22 +453,40 @@ def __setitem__(self, key: Hashable, value: Any) -> None:

def update(self, other: Mapping[Any, Any]) -> None:
"""Update this Coordinates variables with other coordinate variables."""
other_obj: Coordinates | Mapping[Hashable, Variable]

if not len(other):
return

other_coords: Coordinates

if isinstance(other, Coordinates):
# special case: default indexes won't be created
other_obj = other
# Coordinates object: just pass it (default indexes won't be created)
other_coords = other
else:
other_obj = getattr(other, "variables", other)
other_coords = create_coords_with_default_indexes(
getattr(other, "variables", other)
)

self._maybe_drop_multiindex_coords(set(other_obj))
# Discard original indexed coordinates prior to merge allows to:
# - fail early if the new coordinates don't preserve the integrity of existing
# multi-coordinate indexes
# - drop & replace coordinates without alignment (note: we must keep indexed
# coordinates extracted from the DataArray objects passed as values to
# `other` - if any - as those are still used for aligning the old/new coordinates)
coords_to_align = drop_indexed_coords(set(other_coords) & set(other), self)

coords, indexes = merge_coords(
[self.variables, other_obj],
[coords_to_align, other_coords],
priority_arg=1,
indexes=self.xindexes,
indexes=coords_to_align.xindexes,
)

# special case for PandasMultiIndex: updating only its dimension coordinate
# is still allowed but depreciated.
# It is the only case where we need to actually drop coordinates here (multi-index levels)
# TODO: remove when removing PandasMultiIndex's dimension coordinate.
self._drop_coords(self._names - coords_to_align._names)

self._update_coords(coords, indexes)

def _overwrite_indexes(
Expand Down Expand Up @@ -610,15 +627,20 @@ def _update_coords(
original_indexes.update(indexes)
self._data._indexes = original_indexes

def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
"""Drops variables in coords, and any associated variables as well."""
def _drop_coords(self, coord_names):
# should drop indexed coordinates only
for name in coord_names:
del self._data._variables[name]
del self._data._indexes[name]
self._data._coord_names.difference_update(coord_names)

def _drop_indexed_coords(self, coords_to_drop: set[Hashable]) -> None:
assert self._data.xindexes is not None
variables, indexes = drop_coords(
coords, self._data._variables, self._data.xindexes
)
self._data._coord_names.intersection_update(variables)
self._data._variables = variables
self._data._indexes = indexes
new_coords = drop_indexed_coords(coords_to_drop, self)
for name in self._data._coord_names - new_coords._names:
del self._data._variables[name]
self._data._indexes = dict(new_coords.xindexes)
self._data._coord_names.intersection_update(new_coords._names)

def __delitem__(self, key: Hashable) -> None:
if key in self:
Expand Down Expand Up @@ -691,13 +713,11 @@ def _update_coords(
original_indexes.update(indexes)
self._data._indexes = original_indexes

def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
"""Drops variables in coords, and any associated variables as well."""
variables, indexes = drop_coords(
coords, self._data._coords, self._data.xindexes
)
self._data._coords = variables
self._data._indexes = indexes
def _drop_coords(self, coord_names):
# should drop indexed coordinates only
for name in coord_names:
del self._data._coords[name]
del self._data._indexes[name]

@property
def variables(self):
Expand All @@ -724,35 +744,48 @@ def _ipython_key_completions_(self):
return self._data._ipython_key_completions_()


def drop_coords(
coords_to_drop: set[Hashable], variables, indexes: Indexes
) -> tuple[dict, dict]:
"""Drop index variables associated with variables in coords_to_drop."""
# Only warn when we're dropping the dimension with the multi-indexed coordinate
# If asked to drop a subset of the levels in a multi-index, we raise an error
# later but skip the warning here.
new_variables = dict(variables.copy())
new_indexes = dict(indexes.copy())
for key in coords_to_drop & set(indexes):
maybe_midx = indexes[key]
idx_coord_names = set(indexes.get_all_coords(key))
if (
isinstance(maybe_midx, PandasMultiIndex)
and key == maybe_midx.dim
and (idx_coord_names - coords_to_drop)
):
warnings.warn(
f"Updating MultiIndexed coordinate {key!r} would corrupt indices for "
f"other variables: {list(maybe_midx.index.names)!r}. "
f"This will raise an error in the future. Use `.drop_vars({idx_coord_names!r})` before "
def drop_indexed_coords(
coords_to_drop: set[Hashable], coords: Coordinates
) -> Coordinates:
"""Drop indexed coordinates associated with coordinates in coords_to_drop.
This will raise an error in case it corrupts any passed index and its
coordinate variables.
"""
new_variables = dict(coords.variables)
new_indexes = dict(coords.xindexes)

for idx, idx_coords in coords.xindexes.group_by_index():
idx_drop_coords = set(idx_coords) & coords_to_drop

# special case for pandas multi-index: still allow but deprecate
# dropping only its dimension coordinate.
# TODO: remove when removing PandasMultiIndex's dimension coordinate.
if isinstance(idx, PandasMultiIndex) and idx_drop_coords == {idx.dim}:
idx_drop_coords.update(idx.index.names)
emit_user_level_warning(
f"updating coordinate {idx.dim!r} with a PandasMultiIndex would leave "
f"the multi-index level coordinates {list(idx.index.names)!r} in an inconsistent state. "
f"This will raise an error in the future. Use `.drop_vars({list(idx_coords)!r})` before "
"assigning new coordinate values.",
FutureWarning,
stacklevel=4,
)
for k in idx_coord_names:
del new_variables[k]
del new_indexes[k]
return new_variables, new_indexes

elif idx_drop_coords and len(idx_drop_coords) != len(idx_coords):
idx_drop_coords_str = ", ".join(f"{k!r}" for k in idx_drop_coords)
idx_coords_str = ", ".join(f"{k!r}" for k in idx_coords)
raise ValueError(
f"cannot drop or update coordinate(s) {idx_drop_coords_str}, which would corrupt "
f"the following index built from coordinates {idx_coords_str}:\n"
f"{idx}"
)

for k in idx_drop_coords:
del new_variables[k]
del new_indexes[k]

return Coordinates._construct_direct(coords=new_variables, indexes=new_indexes)


def assert_coordinate_consistent(
Expand All @@ -773,11 +806,15 @@ def assert_coordinate_consistent(


def create_coords_with_default_indexes(
coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None
coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None
) -> Coordinates:
"""Maybe create default indexes from a mapping of coordinates."""
"""Returns a Coordinates object from a mapping of coordinates (arbitrary objects).
Create default (pandas) indexes for each of the input dimension coordinates.
Extract coordinates from each input DataArray.
# Note: data_vars are needed here only because a pd.MultiIndex object
"""
# Note: data_vars is needed here only because a pd.MultiIndex object
# can be promoted as coordinates.
# TODO: It won't be relevant anymore when this behavior will be dropped
# in favor of the more explicit ``Coordinates.from_pandas_multiindex()``.
Expand All @@ -791,34 +828,34 @@ def create_coords_with_default_indexes(
indexes: dict[Hashable, Index] = {}
variables: dict[Hashable, Variable] = {}

maybe_index_vars: dict[Hashable, Variable] = {}
mindex_data_vars: list[Hashable] = []
# promote any pandas multi-index in data_vars as coordinates
coords_promoted: dict[Hashable, Any] = {}
pd_mindex_keys: list[Hashable] = []

for k, v in all_variables.items():
if k in coords:
maybe_index_vars[k] = v
elif isinstance(v, pd.MultiIndex):
# TODO: eventually stop promoting multi-index passed via data variables
mindex_data_vars.append(k)
maybe_index_vars[k] = v

if mindex_data_vars:
warnings.warn(
f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} "
"will no longer create indexed coordinates in the future. "
"If you want to keep this behavior, pass it as coordinates instead.",
if isinstance(v, pd.MultiIndex):
coords_promoted[k] = v
pd_mindex_keys.append(k)
elif k in coords:
coords_promoted[k] = v

if pd_mindex_keys:
pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys])
emit_user_level_warning(
f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or "
"data variable(s) will no longer be implicitly promoted and wrapped into "
"multiple indexed coordinates in the future "
"(i.e., one coordinate for each multi-index level + one dimension coordinate). "
"If you want to keep this behavior, you need to first wrap it explicitly using "
"`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` "
"and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, "
"`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.",
FutureWarning,
)

maybe_index_vars = {
k: v
for k, v in all_variables.items()
if k in coords or isinstance(v, pd.MultiIndex)
}

dataarray_coords: list[DataArrayCoordinates] = []

for name, obj in maybe_index_vars.items():
for name, obj in coords_promoted.items():
if isinstance(obj, DataArray):
dataarray_coords.append(obj.coords)

Expand Down
22 changes: 19 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes
from xarray.core.dataarray import DataArray
from xarray.core.groupby import DatasetGroupBy
from xarray.core.merge import CoercibleMapping
from xarray.core.merge import CoercibleMapping, CoercibleValue
from xarray.core.parallelcompat import ChunkManagerEntrypoint
from xarray.core.resample import DatasetResample
from xarray.core.rolling import DatasetCoarsen, DatasetRolling
Expand Down Expand Up @@ -6879,6 +6879,10 @@ def assign(
possible, but you cannot reference other variables created within the
same ``assign`` call.
The new assigned variables that replace existing coordinates in the
original dataset are still listed as coordinates in the returned
Dataset.
See Also
--------
pandas.DataFrame.assign
Expand Down Expand Up @@ -6934,11 +6938,23 @@ def assign(
"""
variables = either_dict_or_kwargs(variables, variables_kwargs, "assign")
data = self.copy()

# do all calculations first...
results: CoercibleMapping = data._calc_assign_results(variables)
data.coords._maybe_drop_multiindex_coords(set(results.keys()))

# split data variables to add/replace vs. coordinates to replace
results_data_vars: dict[Hashable, CoercibleValue] = {}
results_coords: dict[Hashable, CoercibleValue] = {}
for k, v in results.items():
if k in data._coord_names:
results_coords[k] = v
else:
results_data_vars[k] = v

# ... and then assign
data.update(results)
data.coords.update(results_coords)
data.update(results_data_vars)

return data

def to_array(
Expand Down
Loading

0 comments on commit 1fedfd8

Please sign in to comment.