Merge branch 'main' into grouper-public-api
dcherian authored Jul 11, 2024
2 parents 5938e67 + e12aa44 commit 12c4316
Showing 16 changed files with 254 additions and 112 deletions.
5 changes: 2 additions & 3 deletions ci/install-upstream-wheels.sh
@@ -13,7 +13,7 @@ $conda remove -y numba numbagg sparse
# temporarily remove numexpr
$conda remove -y numexpr
# temporarily remove backends
$conda remove -y cf_units hdf5 h5py netcdf4 pydap
$conda remove -y pydap
# forcibly remove packages to avoid artifacts
$conda remove -y --force \
numpy \
@@ -37,8 +37,7 @@ python -m pip install \
numpy \
scipy \
matplotlib \
pandas \
h5py
pandas
# for some reason pandas depends on pyarrow already.
# Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge`
python -m pip install \
2 changes: 1 addition & 1 deletion ci/requirements/all-but-dask.yml
@@ -22,7 +22,7 @@ dependencies:
- netcdf4
- numba
- numbagg
- numpy<2
- numpy
- packaging
- pandas
- pint>=0.22
2 changes: 1 addition & 1 deletion ci/requirements/environment-windows.yml
@@ -23,7 +23,7 @@ dependencies:
- netcdf4
- numba
- numbagg
- numpy<2
- numpy
- packaging
- pandas
# - pint>=0.22
2 changes: 1 addition & 1 deletion ci/requirements/environment.yml
@@ -26,7 +26,7 @@ dependencies:
- numba
- numbagg
- numexpr
- numpy<2
- numpy
- opt_einsum
- packaging
- pandas
12 changes: 11 additions & 1 deletion doc/whats-new.rst
@@ -37,14 +37,24 @@ Deprecations

Bug fixes
~~~~~~~~~
- Fix scatter plot broadcasting unnecessarily (:issue:`9129`, :pull:`9206`).
By `Jimmy Westling <https://github.com/illviljan>`_.
- Don't convert custom indexes to ``pandas`` indexes when computing a diff (:pull:`9157`)
By `Justus Magin <https://github.com/keewis>`_.
- Make :py:func:`testing.assert_allclose` work with numpy 2.0 (:issue:`9165`, :pull:`9166`).
By `Pontus Lurcock <https://github.com/pont-us>`_.
- Allow diffing objects with array attributes on variables (:issue:`9153`, :pull:`9169`).
By `Justus Magin <https://github.com/keewis>`_.
- ``numpy>=2`` compatibility in the ``netcdf4`` backend (:pull:`9136`).
By `Justus Magin <https://github.com/keewis>`_ and `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`).
By `Justus Magin <https://github.com/keewis>`_.
- Address regression introduced in :pull:`9002` that prevented objects returned
by :py:meth:`DataArray.convert_calendar` from being indexed by a time index in
certain circumstances (:issue:`9138`, :pull:`9192`). By `Mark Harfouche
<https://github.com/hmaarrfk>`_ and `Spencer Clark
<https://github.com/spencerkclark>`_.

- Fix static typing of tolerance arguments by allowing ``str`` type (:issue:`8892`, :pull:`9194`).
By `Michael Niklas <https://github.com/headtr1ck>`_.
- Dark themes are now properly detected for ``html[data-theme=dark]``-tags (:pull:`9200`).
@@ -59,7 +69,7 @@ Documentation
- Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_).
By `Jessica Scheick <https://github.com/jessicas11>`_.
- Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`)
By `Maximilian Roos <https://github.com/max-sixty>`_
By `Maximilian Roos <https://github.com/max-sixty>`_.


Internal Changes
12 changes: 11 additions & 1 deletion xarray/coding/calendar_ops.py
@@ -5,7 +5,10 @@

from xarray.coding.cftime_offsets import date_range_like, get_date_type
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.coding.times import _should_cftime_be_used, convert_times
from xarray.coding.times import (
_should_cftime_be_used,
convert_times,
)
from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like

try:
@@ -222,6 +225,13 @@ def convert_calendar(
# Remove NaN that were put on invalid dates in target calendar
out = out.where(out[dim].notnull(), drop=True)

if use_cftime:
# Reassign times to ensure time index of output is a CFTimeIndex
# (previously it was an Index due to the presence of NaN values).
# Note this is not needed in the case that the output time index is
# a DatetimeIndex, since DatetimeIndexes can handle NaN values.
out[dim] = CFTimeIndex(out[dim].data)

if missing is not None:
time_target = date_range_like(time, calendar=calendar, use_cftime=use_cftime)
out = out.reindex({dim: time_target}, fill_value=missing)
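The reassignment above keeps the output's time coordinate backed by a CFTimeIndex even though NaN values were temporarily present for dates missing from the target calendar. A minimal usage sketch of the behaviour this protects (hypothetical data; assumes cftime is installed):

import numpy as np
import pandas as pd
import xarray as xr

# Jan 31 does not exist in a 360_day calendar, so conversion first marks it as
# missing and then drops that slot; the time index should remain a CFTimeIndex.
times = pd.date_range("2000-01-01", periods=40, freq="D")
da = xr.DataArray(np.arange(40), coords={"time": times}, dims="time")
converted = da.convert_calendar("360_day", align_on="date", use_cftime=True)
print(type(converted.indexes["time"]))  # expected: xarray.CFTimeIndex
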
24 changes: 12 additions & 12 deletions xarray/coding/times.py
@@ -22,7 +22,7 @@
)
from xarray.core import indexing
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
from xarray.core.duck_array_ops import asarray
from xarray.core.duck_array_ops import asarray, ravel, reshape
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.utils import emit_user_level_warning
@@ -315,7 +315,7 @@ def decode_cf_datetime(
cftime.num2date
"""
num_dates = np.asarray(num_dates)
flat_num_dates = num_dates.ravel()
flat_num_dates = ravel(num_dates)
if calendar is None:
calendar = "standard"

@@ -348,7 +348,7 @@
else:
dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)

return dates.reshape(num_dates.shape)
return reshape(dates, num_dates.shape)


def to_timedelta_unboxed(value, **kwargs):
@@ -369,8 +369,8 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray:
"""
num_timedeltas = np.asarray(num_timedeltas)
units = _netcdf_to_numpy_timeunit(units)
result = to_timedelta_unboxed(num_timedeltas.ravel(), unit=units)
return result.reshape(num_timedeltas.shape)
result = to_timedelta_unboxed(ravel(num_timedeltas), unit=units)
return reshape(result, num_timedeltas.shape)


def _unit_timedelta_cftime(units: str) -> timedelta:
@@ -428,7 +428,7 @@ def infer_datetime_units(dates) -> str:
'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
unique time deltas in `dates`)
"""
dates = np.asarray(dates).ravel()
dates = ravel(np.asarray(dates))
if np.asarray(dates).dtype == "datetime64[ns]":
dates = to_datetime_unboxed(dates)
dates = dates[pd.notnull(dates)]
@@ -456,7 +456,7 @@ def infer_timedelta_units(deltas) -> str:
{'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly
divide all unique time deltas in `deltas`)
"""
deltas = to_timedelta_unboxed(np.asarray(deltas).ravel())
deltas = to_timedelta_unboxed(ravel(np.asarray(deltas)))
unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
return _infer_time_units_from_diff(unique_timedeltas)

@@ -643,7 +643,7 @@ def encode_datetime(d):
except TypeError:
return np.nan if d is None else cftime.date2num(d, units, calendar)

return np.array([encode_datetime(d) for d in dates.ravel()]).reshape(dates.shape)
return reshape(np.array([encode_datetime(d) for d in ravel(dates)]), dates.shape)


def cast_to_int_if_safe(num) -> np.ndarray:
@@ -753,7 +753,7 @@ def _eagerly_encode_cf_datetime(
# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
# dates to be encoded (GH 2272).
dates_as_index = pd.DatetimeIndex(dates.ravel())
dates_as_index = pd.DatetimeIndex(ravel(dates))
time_deltas = dates_as_index - ref_date

# retrieve needed units to faithfully encode to int64
@@ -791,7 +791,7 @@ def _eagerly_encode_cf_datetime(
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
num = num.values.reshape(dates.shape)
num = reshape(num.values, dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
num = _encode_datetime_with_cftime(dates, units, calendar)
@@ -879,7 +879,7 @@ def _eagerly_encode_cf_timedelta(
units = data_units

time_delta = _time_units_to_timedelta64(units)
time_deltas = pd.TimedeltaIndex(timedeltas.ravel())
time_deltas = pd.TimedeltaIndex(ravel(timedeltas))

# retrieve needed units to faithfully encode to int64
needed_units = data_units
@@ -911,7 +911,7 @@ def _eagerly_encode_cf_timedelta(
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
num = num.values.reshape(timedeltas.shape)
num = reshape(num.values, timedeltas.shape)

if dtype is not None:
num = _cast_to_dtype_if_safe(num, dtype)
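These changes route flattening and reshaping through the ravel and reshape helpers in xarray.core.duck_array_ops rather than calling ndarray methods directly, so duck arrays other than numpy survive encoding and decoding. The user-visible contract is unchanged: decoded output keeps the input's shape. A small sketch of that behaviour (illustrative values only):

import numpy as np
from xarray.coding.times import decode_cf_datetime

# Decoding ravels internally and reshapes afterwards, so a 2-D input
# comes back as a 2-D datetime64 array with the same shape.
num_dates = np.array([[0, 1], [2, 3]])
decoded = decode_cf_datetime(num_dates, units="days since 2000-01-01", calendar="standard")
print(decoded.shape)  # (2, 2)
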
16 changes: 10 additions & 6 deletions xarray/coding/variables.py
@@ -516,10 +516,13 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

pop_to(encoding, attrs, "_Unsigned")
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
# we need the on-disk type here
# try to get it from encoding; fall back to an int with the same precision as data.dtype if not available
signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
if "_FillValue" in attrs:
new_fill = signed_dtype.type(attrs["_FillValue"])
attrs["_FillValue"] = new_fill
new_fill = np.array(attrs["_FillValue"])
# use view here to prevent OverflowError
attrs["_FillValue"] = new_fill.view(signed_dtype).item()
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
@@ -535,10 +538,11 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if unsigned == "true":
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
transform = partial(np.asarray, dtype=unsigned_dtype)
data = lazy_elemwise_func(data, transform, unsigned_dtype)
if "_FillValue" in attrs:
new_fill = unsigned_dtype.type(attrs["_FillValue"])
attrs["_FillValue"] = new_fill
new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
# use view here to prevent OverflowError
attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
data = lazy_elemwise_func(data, transform, unsigned_dtype)
elif data.dtype.kind == "u":
if unsigned == "false":
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
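The new _FillValue handling reinterprets the fill value's bytes via a view instead of casting, which sidesteps the OverflowError that numpy 2 raises for out-of-range conversions such as np.uint8(-1). A rough standalone illustration of the trick:

import numpy as np

# Signed on-disk fill value viewed as unsigned: same bits, no overflow.
fill_on_disk = np.array(-1, dtype=np.int8)
print(fill_on_disk.view(np.uint8).item())   # 255

# Unsigned in-memory fill value viewed back as the signed on-disk type.
fill_in_memory = np.array(255, dtype=np.uint8)
print(fill_in_memory.view(np.int8).item())  # -1
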
40 changes: 23 additions & 17 deletions xarray/core/datatree.py
@@ -61,7 +61,7 @@
import pandas as pd

from xarray.core.datatree_io import T_DataTreeNetcdfEngine, T_DataTreeNetcdfTypes
from xarray.core.merge import CoercibleValue
from xarray.core.merge import CoercibleMapping, CoercibleValue
from xarray.core.types import ErrorOptions, NetcdfWriteModes, ZarrWriteModes

# """
@@ -954,23 +954,29 @@ def update(
Just like `dict.update` this is an in-place operation.
"""
# TODO separate by type
new_children: dict[str, DataTree] = {}
new_variables = {}
for k, v in other.items():
if isinstance(v, DataTree):
# avoid named node being stored under inconsistent key
new_child: DataTree = v.copy()
# Datatree's name is always a string until we fix that (#8836)
new_child.name = str(k)
new_children[str(k)] = new_child
elif isinstance(v, (DataArray, Variable)):
# TODO this should also accommodate other types that can be coerced into Variables
new_variables[k] = v
else:
raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree")

vars_merge_result = dataset_update_method(self.to_dataset(), new_variables)
new_variables: CoercibleMapping

if isinstance(other, Dataset):
new_variables = other
else:
new_variables = {}
for k, v in other.items():
if isinstance(v, DataTree):
# avoid named node being stored under inconsistent key
new_child: DataTree = v.copy()
# Datatree's name is always a string until we fix that (#8836)
new_child.name = str(k)
new_children[str(k)] = new_child
elif isinstance(v, (DataArray, Variable)):
# TODO this should also accommodate other types that can be coerced into Variables
new_variables[k] = v
else:
raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree")

vars_merge_result = dataset_update_method(
self.to_dataset(inherited=False), new_variables
)
data = Dataset._construct_direct(**vars_merge_result._asdict())

# TODO are there any subtleties with preserving order of children like this?
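With this change, DataTree.update accepts a whole Dataset as well as a mapping of names to DataTree, DataArray, or Variable objects. A hedged usage sketch (DataTree lived under xarray.core.datatree in this commit and the API was still experimental):

import xarray as xr
from xarray.core.datatree import DataTree

tree = DataTree(name="root")
tree.update(xr.Dataset({"temperature": ("x", [1.0, 2.0, 3.0])}))  # Dataset accepted directly
tree.update({"child": DataTree()})  # mappings of child nodes still work
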
15 changes: 10 additions & 5 deletions xarray/plot/dataset_plot.py
@@ -721,8 +721,8 @@ def _temp_dataarray(ds: Dataset, y: Hashable, locals_: dict[str, Any]) -> DataArray:
"""Create a temporary datarray with extra coords."""
from xarray.core.dataarray import DataArray

# Base coords:
coords = dict(ds.coords)
coords = dict(ds[y].coords)
dims = set(ds[y].dims)

# Add extra coords to the DataArray from valid kwargs, if using all
# kwargs there is a risk that we add unnecessary dataarrays as
@@ -732,12 +732,17 @@ def _temp_dataarray(ds: Dataset, y: Hashable, locals_: dict[str, Any]) -> DataArray:
coord_kwargs = locals_.keys() & valid_coord_kwargs
for k in coord_kwargs:
key = locals_[k]
if ds.data_vars.get(key) is not None:
coords[key] = ds[key]
darray = ds.get(key)
if darray is not None:
coords[key] = darray
dims.update(darray.dims)

# Trim unnecessary dims from the dataset:
ds_trimmed = ds.drop_dims(ds.sizes.keys() - dims) # TODO: Use ds.dims in the future

# The dataarray has to include all the dims. Broadcast to that shape
# and add the additional coords:
_y = ds[y].broadcast_like(ds)
_y = ds[y].broadcast_like(ds_trimmed)

return DataArray(_y, coords=coords)

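The trimming above means the temporary DataArray used for Dataset scatter plots is only broadcast against dims that y and the requested coord kwargs actually use, not against every dim in the Dataset. A minimal sketch of the situation this avoids (hypothetical variables; requires matplotlib):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "a": ("x", np.arange(4.0)),
        "b": ("x", np.arange(4.0) ** 2),
        "unrelated": ("z", np.zeros(10_000)),
    }
)
# "b" is no longer broadcast against the unrelated "z" dimension.
ds.plot.scatter(x="a", y="b")
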
33 changes: 31 additions & 2 deletions xarray/tests/test_backends.py
@@ -166,7 +166,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset:

def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
encoding = {
"_FillValue": 255,
"_FillValue": np.int8(-1),
"_Unsigned": "true",
"dtype": "i1",
"add_offset": dtype.type(10),
@@ -925,6 +925,35 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
assert decoded.variables[k].dtype == actual.variables[k].dtype
assert_allclose(decoded, actual, decode_bytes=False)

@pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255)])
def test_roundtrip_unsigned(self, fillvalue):
# regression/numpy2 test for
encoding = {
"_FillValue": fillvalue,
"_Unsigned": "true",
"dtype": "i1",
}
x = np.array([0, 1, 127, 128, 254, np.nan], dtype=np.float32)
decoded = Dataset({"x": ("t", x, {}, encoding)})

attributes = {
"_FillValue": fillvalue,
"_Unsigned": "true",
}
# Create signed data corresponding to [0, 1, 127, 128, 254, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1")
encoded = Dataset({"x": ("t", sb, attributes)})

with self.roundtrip(decoded) as actual:
for k in decoded.variables:
assert decoded.variables[k].dtype == actual.variables[k].dtype
assert_allclose(decoded, actual, decode_bytes=False)

with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
for k in encoded.variables:
assert encoded.variables[k].dtype == actual.variables[k].dtype
assert_allclose(encoded, actual, decode_bytes=False)

@staticmethod
def _create_cf_dataset():
original = Dataset(
@@ -4285,7 +4314,7 @@ def test_roundtrip_coordinates_with_space(self) -> None:
def test_roundtrip_numpy_datetime_data(self) -> None:
# Override method in DatasetIOBase - remove not applicable
# save_kwargs
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"], unit="ns")
expected = Dataset({"t": ("t", times), "t0": times[0]})
with self.roundtrip(expected) as actual:
assert_identical(expected, actual)