From bac01c05a6e299ca61677af47d0bcb841de1787a Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 8 Jul 2024 02:49:13 -0700 Subject: [PATCH 1/9] Use numpy 2.0-compat `np.complex64` dtype in test (#9217) --- xarray/tests/test_computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 4b9b95b27bb..b000de311af 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -2602,7 +2602,7 @@ def test_cross(a, b, ae, be, dim: str, axis: int, use_dask: bool) -> None: @pytest.mark.parametrize("compute_backend", ["numbagg"], indirect=True) def test_complex_number_reduce(compute_backend): - da = xr.DataArray(np.ones((2,), dtype=np.complex_), dims=["x"]) + da = xr.DataArray(np.ones((2,), dtype=np.complex64), dims=["x"]) # Check that xarray doesn't call into numbagg, which doesn't compile for complex # numbers at the moment (but will when numba supports dynamic compilation) da.min() From 179c6706615854fc861335d929bd6c4761485a93 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 8 Jul 2024 10:12:54 -0700 Subject: [PATCH 2/9] Fix two bugs in DataTree.update() (#9214) * Fix two bugs in DataTree.update() 1. Fix handling of coordinates on a Dataset argument (previously these were silently dropped). 2. Do not copy inherited coordinates down to lower level nodes. 
* add mypy annotation --- xarray/core/datatree.py | 40 ++++++++++++++++++++--------------- xarray/tests/test_datatree.py | 37 ++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 38f8f8cd495..65ff8667cb7 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -61,7 +61,7 @@ import pandas as pd from xarray.core.datatree_io import T_DataTreeNetcdfEngine, T_DataTreeNetcdfTypes - from xarray.core.merge import CoercibleValue + from xarray.core.merge import CoercibleMapping, CoercibleValue from xarray.core.types import ErrorOptions, NetcdfWriteModes, ZarrWriteModes # """ @@ -954,23 +954,29 @@ def update( Just like `dict.update` this is an in-place operation. """ - # TODO separate by type new_children: dict[str, DataTree] = {} - new_variables = {} - for k, v in other.items(): - if isinstance(v, DataTree): - # avoid named node being stored under inconsistent key - new_child: DataTree = v.copy() - # Datatree's name is always a string until we fix that (#8836) - new_child.name = str(k) - new_children[str(k)] = new_child - elif isinstance(v, (DataArray, Variable)): - # TODO this should also accommodate other types that can be coerced into Variables - new_variables[k] = v - else: - raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree") - - vars_merge_result = dataset_update_method(self.to_dataset(), new_variables) + new_variables: CoercibleMapping + + if isinstance(other, Dataset): + new_variables = other + else: + new_variables = {} + for k, v in other.items(): + if isinstance(v, DataTree): + # avoid named node being stored under inconsistent key + new_child: DataTree = v.copy() + # Datatree's name is always a string until we fix that (#8836) + new_child.name = str(k) + new_children[str(k)] = new_child + elif isinstance(v, (DataArray, Variable)): + # TODO this should also accommodate other types that can be coerced into Variables + new_variables[k] = 
v + else: + raise TypeError(f"Type {type(v)} cannot be assigned to a DataTree") + + vars_merge_result = dataset_update_method( + self.to_dataset(inherited=False), new_variables + ) data = Dataset._construct_direct(**vars_merge_result._asdict()) # TODO are there any subtleties with preserving order of children like this? diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index f2b58fa2489..f7cff17bab5 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -244,11 +244,6 @@ def test_update(self): dt: DataTree = DataTree() dt.update({"foo": xr.DataArray(0), "a": DataTree()}) expected = DataTree.from_dict({"/": xr.Dataset({"foo": 0}), "a": None}) - print(dt) - print(dt.children) - print(dt._children) - print(dt["a"]) - print(expected) assert_equal(dt, expected) def test_update_new_named_dataarray(self): @@ -268,14 +263,38 @@ def test_update_doesnt_alter_child_name(self): def test_update_overwrite(self): actual = DataTree.from_dict({"a": DataTree(xr.Dataset({"x": 1}))}) actual.update({"a": DataTree(xr.Dataset({"x": 2}))}) - expected = DataTree.from_dict({"a": DataTree(xr.Dataset({"x": 2}))}) + assert_equal(actual, expected) - print(actual) - print(expected) - + def test_update_coordinates(self): + expected = DataTree.from_dict({"/": xr.Dataset(coords={"a": 1})}) + actual = DataTree.from_dict({"/": xr.Dataset()}) + actual.update(xr.Dataset(coords={"a": 1})) assert_equal(actual, expected) + def test_update_inherited_coords(self): + expected = DataTree.from_dict( + { + "/": xr.Dataset(coords={"a": 1}), + "/b": xr.Dataset(coords={"c": 1}), + } + ) + actual = DataTree.from_dict( + { + "/": xr.Dataset(coords={"a": 1}), + "/b": xr.Dataset(), + } + ) + actual["/b"].update(xr.Dataset(coords={"c": 1})) + assert_identical(actual, expected) + + # DataTree.identical() currently does not require that non-inherited + # coordinates are defined identically, so we need to check this + # explicitly + actual_node = 
actual.children["b"].to_dataset(inherited=False) + expected_node = expected.children["b"].to_dataset(inherited=False) + assert_identical(actual_node, expected_node) + class TestCopy: def test_copy(self, create_test_datatree): From 3024655e3689c11908d221913cdb922bcfc69037 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Tue, 9 Jul 2024 09:09:18 +0200 Subject: [PATCH 3/9] Only use necessary dims when creating temporary dataarray (#9206) * Only use necessary dims when creating temporary dataarray * Update dataset_plot.py * Can't check only data_vars all coords are no longer added by default * Update dataset_plot.py * Add tests * Update whats-new.rst * Update dataset_plot.py --- doc/whats-new.rst | 2 ++ xarray/plot/dataset_plot.py | 15 +++++++++----- xarray/tests/test_plot.py | 40 +++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8c6b3a099c2..0c401c2348e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -37,6 +37,8 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix scatter plot broadcasting unneccesarily. (:issue:`9129`, :pull:`9206`) + By `Jimmy Westling `_. - Don't convert custom indexes to ``pandas`` indexes when computing a diff (:pull:`9157`) By `Justus Magin `_. - Make :py:func:`testing.assert_allclose` work with numpy 2.0 (:issue:`9165`, :pull:`9166`).
diff --git a/xarray/plot/dataset_plot.py b/xarray/plot/dataset_plot.py index edc2bf43629..96b59f6174e 100644 --- a/xarray/plot/dataset_plot.py +++ b/xarray/plot/dataset_plot.py @@ -721,8 +721,8 @@ def _temp_dataarray(ds: Dataset, y: Hashable, locals_: dict[str, Any]) -> DataAr """Create a temporary datarray with extra coords.""" from xarray.core.dataarray import DataArray - # Base coords: - coords = dict(ds.coords) + coords = dict(ds[y].coords) + dims = set(ds[y].dims) # Add extra coords to the DataArray from valid kwargs, if using all # kwargs there is a risk that we add unnecessary dataarrays as @@ -732,12 +732,17 @@ def _temp_dataarray(ds: Dataset, y: Hashable, locals_: dict[str, Any]) -> DataAr coord_kwargs = locals_.keys() & valid_coord_kwargs for k in coord_kwargs: key = locals_[k] - if ds.data_vars.get(key) is not None: - coords[key] = ds[key] + darray = ds.get(key) + if darray is not None: + coords[key] = darray + dims.update(darray.dims) + + # Trim dataset from unneccessary dims: + ds_trimmed = ds.drop_dims(ds.sizes.keys() - dims) # TODO: Use ds.dims in the future # The dataarray has to include all the dims. 
Broadcast to that shape # and add the additional coords: - _y = ds[y].broadcast_like(ds) + _y = ds[y].broadcast_like(ds_trimmed) return DataArray(_y, coords=coords) diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index b302ad3af93..fa08e9975ab 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -3416,3 +3416,43 @@ def test_9155() -> None: data = xr.DataArray([1, 2, 3], dims=["x"]) fig, ax = plt.subplots(ncols=1, nrows=1) data.plot(ax=ax) + + +@requires_matplotlib +def test_temp_dataarray() -> None: + from xarray.plot.dataset_plot import _temp_dataarray + + x = np.arange(1, 4) + y = np.arange(4, 6) + var1 = np.arange(x.size * y.size).reshape((x.size, y.size)) + var2 = np.arange(x.size * y.size).reshape((x.size, y.size)) + ds = xr.Dataset( + { + "var1": (["x", "y"], var1), + "var2": (["x", "y"], 2 * var2), + "var3": (["x"], 3 * x), + }, + coords={ + "x": x, + "y": y, + "model": np.arange(7), + }, + ) + + # No broadcasting: + y_ = "var1" + locals_ = {"x": "var2"} + da = _temp_dataarray(ds, y_, locals_) + assert da.shape == (3, 2) + + # Broadcast from 1 to 2dim: + y_ = "var3" + locals_ = {"x": "var1"} + da = _temp_dataarray(ds, y_, locals_) + assert da.shape == (3, 2) + + # Ignore non-valid coord kwargs: + y_ = "var3" + locals_ = dict(x="x", extend="var2") + da = _temp_dataarray(ds, y_, locals_) + assert da.shape == (3,) From 879b06b06fe2a08dcc1104761b32a31b64b7d74b Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:34:48 +0200 Subject: [PATCH 4/9] Cleanup test_coding_times.py (#9223) * Cleanup test_coding_times * Update test_coding_times.py --- xarray/tests/test_coding_times.py | 93 +++++++++++++++---------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 393f8400c46..d568bdc3268 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ 
-14,13 +14,15 @@ Dataset, Variable, cftime_range, - coding, conventions, date_range, decode_cf, ) +from xarray.coding.times import _STANDARD_CALENDARS as _STANDARD_CALENDARS_UNSORTED from xarray.coding.times import ( + CFDatetimeCoder, _encode_datetime_with_cftime, + _netcdf_to_numpy_timeunit, _numpy_to_netcdf_timeunit, _should_cftime_be_used, cftime_to_nptime, @@ -28,6 +30,9 @@ decode_cf_timedelta, encode_cf_datetime, encode_cf_timedelta, + format_cftime_datetime, + infer_datetime_units, + infer_timedelta_units, to_timedelta_unboxed, ) from xarray.coding.variables import SerializationWarning @@ -53,11 +58,9 @@ "all_leap", "366_day", } -_ALL_CALENDARS = sorted( - _NON_STANDARD_CALENDARS_SET.union(coding.times._STANDARD_CALENDARS) -) +_STANDARD_CALENDARS = sorted(_STANDARD_CALENDARS_UNSORTED) +_ALL_CALENDARS = sorted(_NON_STANDARD_CALENDARS_SET.union(_STANDARD_CALENDARS)) _NON_STANDARD_CALENDARS = sorted(_NON_STANDARD_CALENDARS_SET) -_STANDARD_CALENDARS = sorted(coding.times._STANDARD_CALENDARS) _CF_DATETIME_NUM_DATES_UNITS = [ (np.arange(10), "days since 2000-01-01"), (np.arange(10).astype("float64"), "days since 2000-01-01"), @@ -130,7 +133,7 @@ def test_cf_datetime(num_dates, units, calendar) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime(num_dates, units, calendar) + actual = decode_cf_datetime(num_dates, units, calendar) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -139,17 +142,15 @@ def test_cf_datetime(num_dates, units, calendar) -> None: # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 assert (abs_diff <= np.timedelta64(1, "s")).all() - encoded, _, _ = coding.times.encode_cf_datetime(actual, units, calendar) + encoded, _, _ = encode_cf_datetime(actual, units, calendar) - assert_array_equal(num_dates, np.around(encoded, 1)) + 
assert_array_equal(num_dates, np.round(encoded, 1)) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: # verify that wrapping with a pandas.Index works # note that it *does not* currently work to put # non-datetime64 compatible dates into a pandas.Index - encoded, _, _ = coding.times.encode_cf_datetime( - pd.Index(actual), units, calendar - ) - assert_array_equal(num_dates, np.around(encoded, 1)) + encoded, _, _ = encode_cf_datetime(pd.Index(actual), units, calendar) + assert_array_equal(num_dates, np.round(encoded, 1)) @requires_cftime @@ -169,7 +170,7 @@ def test_decode_cf_datetime_overflow() -> None: for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - result = coding.times.decode_cf_datetime(day, units) + result = decode_cf_datetime(day, units) assert result == expected[i] @@ -178,7 +179,7 @@ def test_decode_cf_datetime_non_standard_units() -> None: # netCDFs from madis.noaa.gov use this format for their time units # they cannot be parsed by cftime, but pd.Timestamp works units = "hours since 1-1-1970" - actual = coding.times.decode_cf_datetime(np.arange(100), units) + actual = decode_cf_datetime(np.arange(100), units) assert_array_equal(actual, expected) @@ -193,7 +194,7 @@ def test_decode_cf_datetime_non_iso_strings() -> None: (np.arange(100), "hours since 2000-01-01 0:00"), ] for num_dates, units in cases: - actual = coding.times.decode_cf_datetime(num_dates, units) + actual = decode_cf_datetime(num_dates, units) abs_diff = abs(actual - expected.values) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: @@ -212,7 +213,7 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: expected = times.values expected_dtype = np.dtype("M8[ns]") - actual = coding.times.decode_cf_datetime(time, units, calendar=calendar) + actual = decode_cf_datetime(time, units, 
calendar=calendar) assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -235,9 +236,7 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: ) expected_dtype = np.dtype("O") - actual = coding.times.decode_cf_datetime( - non_standard_time, units, calendar=calendar - ) + actual = decode_cf_datetime(non_standard_time, units, calendar=calendar) assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -264,7 +263,7 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime(time, units, calendar=calendar) + actual = decode_cf_datetime(time, units, calendar=calendar) assert all(isinstance(value, expected_date_type) for value in actual) abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -282,7 +281,7 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime(num_time, units, calendar=calendar) + actual = decode_cf_datetime(num_time, units, calendar=calendar) assert actual.dtype == np.dtype("M8[ns]") @@ -295,7 +294,7 @@ def test_decode_non_standard_calendar_single_element_inside_timestamp_range( for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime(num_time, units, calendar=calendar) + actual = decode_cf_datetime(num_time, units, calendar=calendar) assert actual.dtype == np.dtype("O") @@ -309,9 +308,7 @@ def 
test_decode_single_element_outside_timestamp_range(calendar) -> None: for num_time in [days, [days], [[days]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime( - num_time, units, calendar=calendar - ) + actual = decode_cf_datetime(num_time, units, calendar=calendar) expected = cftime.num2date( days, units, calendar, only_use_cftime_datetimes=True @@ -338,7 +335,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( expected1 = times1.values expected2 = times2.values - actual = coding.times.decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime(mdim_time, units, calendar=calendar) assert actual.dtype == np.dtype("M8[ns]") abs_diff1 = abs(actual[:, 0] - expected1) @@ -379,7 +376,7 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( expected_dtype = np.dtype("O") - actual = coding.times.decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime(mdim_time, units, calendar=calendar) assert actual.dtype == expected_dtype abs_diff1 = abs(actual[:, 0] - expected1) @@ -412,7 +409,7 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = coding.times.decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime(mdim_time, units, calendar=calendar) assert actual.dtype == np.dtype("O") @@ -435,7 +432,7 @@ def test_decode_non_standard_calendar_single_element(calendar, num_time) -> None units = "days since 0001-01-01" - actual = coding.times.decode_cf_datetime(num_time, units, calendar=calendar) + actual = decode_cf_datetime(num_time, units, calendar=calendar) expected = np.asarray( cftime.num2date(num_time, units, calendar, only_use_cftime_datetimes=True) @@ -460,9 +457,7 @@ def test_decode_360_day_calendar() -> None: with 
warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - actual = coding.times.decode_cf_datetime( - num_times, units, calendar=calendar - ) + actual = decode_cf_datetime(num_times, units, calendar=calendar) assert len(w) == 0 assert actual.dtype == np.dtype("O") @@ -476,8 +471,8 @@ def test_decode_abbreviation() -> None: val = np.array([1586628000000.0]) units = "msecs since 1970-01-01T00:00:00Z" - actual = coding.times.decode_cf_datetime(val, units) - expected = coding.times.cftime_to_nptime(cftime.num2date(val, units)) + actual = decode_cf_datetime(val, units) + expected = cftime_to_nptime(cftime.num2date(val, units)) assert_array_equal(actual, expected) @@ -498,7 +493,7 @@ def test_decode_abbreviation() -> None: def test_cf_datetime_nan(num_dates, units, expected_list) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "All-NaN") - actual = coding.times.decode_cf_datetime(num_dates, units) + actual = decode_cf_datetime(num_dates, units) # use pandas because numpy will deprecate timezone-aware conversions expected = pd.to_datetime(expected_list).to_numpy(dtype="datetime64[ns]") assert_array_equal(expected, actual) @@ -510,7 +505,7 @@ def test_decoded_cf_datetime_array_2d() -> None: variable = Variable( ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) - result = coding.times.CFDatetimeCoder().decode(variable) + result = CFDatetimeCoder().decode(variable) assert result.dtype == "datetime64[ns]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -531,7 +526,7 @@ def test_decoded_cf_datetime_array_2d() -> None: def test_infer_datetime_units(freq, units) -> None: dates = pd.date_range("2000", periods=2, freq=freq) expected = f"{units} since 2000-01-01 00:00:00" - assert expected == coding.times.infer_datetime_units(dates) + assert expected == infer_datetime_units(dates) @pytest.mark.parametrize( @@ -549,7 +544,7 @@ def 
test_infer_datetime_units(freq, units) -> None: ], ) def test_infer_datetime_units_with_NaT(dates, expected) -> None: - assert expected == coding.times.infer_datetime_units(dates) + assert expected == infer_datetime_units(dates) _CFTIME_DATETIME_UNITS_TESTS = [ @@ -573,7 +568,7 @@ def test_infer_datetime_units_with_NaT(dates, expected) -> None: def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: date_type = _all_cftime_date_types()[calendar] dates = [date_type(*args) for args in date_args] - assert expected == coding.times.infer_datetime_units(dates) + assert expected == infer_datetime_units(dates) @pytest.mark.filterwarnings("ignore:Timedeltas can't be serialized faithfully") @@ -600,18 +595,18 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: numbers = np.array(numbers) expected = numbers - actual, _ = coding.times.encode_cf_timedelta(timedeltas, units) + actual, _ = encode_cf_timedelta(timedeltas, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype if units is not None: expected = timedeltas - actual = coding.times.decode_cf_timedelta(numbers, units) + actual = decode_cf_timedelta(numbers, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype expected = np.timedelta64("NaT", "ns") - actual = coding.times.decode_cf_timedelta(np.array(np.nan), "days") + actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) @@ -622,7 +617,7 @@ def test_cf_timedelta_2d() -> None: timedeltas = np.atleast_2d(to_timedelta_unboxed(["1D", "2D", "3D"])) expected = timedeltas - actual = coding.times.decode_cf_timedelta(numbers, units) + actual = decode_cf_timedelta(numbers, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype @@ -637,7 +632,7 @@ def test_cf_timedelta_2d() -> None: ], ) def test_infer_timedelta_units(deltas, expected) -> None: - assert expected == coding.times.infer_timedelta_units(deltas) + assert expected == 
infer_timedelta_units(deltas) @requires_cftime @@ -653,7 +648,7 @@ def test_infer_timedelta_units(deltas, expected) -> None: def test_format_cftime_datetime(date_args, expected) -> None: date_types = _all_cftime_date_types() for date_type in date_types.values(): - result = coding.times.format_cftime_datetime(date_type(*date_args)) + result = format_cftime_datetime(date_type(*date_args)) assert result == expected @@ -1008,7 +1003,7 @@ def test_decode_ambiguous_time_warns(calendar) -> None: # we don't decode non-standard calendards with # pandas so expect no warning will be emitted - is_standard_calendar = calendar in coding.times._STANDARD_CALENDARS + is_standard_calendar = calendar in _STANDARD_CALENDARS dates = [1, 2, 3] units = "days since 1-1-1" @@ -1043,9 +1038,9 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( pytest.skip("Nanosecond frequency is not valid for cftime dates.") times = date_range("2000", periods=3, freq=freq) units = f"{encoding_units} since 2000-01-01" - encoded, _units, _ = coding.times.encode_cf_datetime(times, units) + encoded, _units, _ = encode_cf_datetime(times, units) - numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) + numpy_timeunit = _netcdf_to_numpy_timeunit(encoding_units) encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit) if pd.to_timedelta(1, freq) >= encoding_units_as_timedelta: assert encoded.dtype == np.int64 @@ -1202,7 +1197,7 @@ def test_decode_float_datetime(): def test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan}) - result = coding.times.CFDatetimeCoder().decode(variable) + result = CFDatetimeCoder().decode(variable) assert np.isnan(result.attrs["units"]) From 7ff5d8d1f367898f9e09a83ffe993fac9e90047b Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 11 Jul 2024 03:57:16 +0200 Subject: [PATCH 
5/9] Use reshape and ravel from duck_array_ops in coding/times.py (#9225) * Use duck_array_ops.ravel * Use duck_array_ops.reshape --- xarray/coding/times.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 34d4f9a23ad..50a2ba93c09 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -22,7 +22,7 @@ ) from xarray.core import indexing from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like -from xarray.core.duck_array_ops import asarray +from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item from xarray.core.pdcompat import nanosecond_precision_timestamp from xarray.core.utils import emit_user_level_warning @@ -315,7 +315,7 @@ def decode_cf_datetime( cftime.num2date """ num_dates = np.asarray(num_dates) - flat_num_dates = num_dates.ravel() + flat_num_dates = ravel(num_dates) if calendar is None: calendar = "standard" @@ -348,7 +348,7 @@ def decode_cf_datetime( else: dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) - return dates.reshape(num_dates.shape) + return reshape(dates, num_dates.shape) def to_timedelta_unboxed(value, **kwargs): @@ -369,8 +369,8 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: """ num_timedeltas = np.asarray(num_timedeltas) units = _netcdf_to_numpy_timeunit(units) - result = to_timedelta_unboxed(num_timedeltas.ravel(), unit=units) - return result.reshape(num_timedeltas.shape) + result = to_timedelta_unboxed(ravel(num_timedeltas), unit=units) + return reshape(result, num_timedeltas.shape) def _unit_timedelta_cftime(units: str) -> timedelta: @@ -428,7 +428,7 @@ def infer_datetime_units(dates) -> str: 'hours', 'minutes' or 'seconds' (the first one that can evenly divide all unique time deltas in `dates`) """ - dates = np.asarray(dates).ravel() + dates = ravel(np.asarray(dates)) if 
np.asarray(dates).dtype == "datetime64[ns]": dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] @@ -456,7 +456,7 @@ def infer_timedelta_units(deltas) -> str: {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly divide all unique time deltas in `deltas`) """ - deltas = to_timedelta_unboxed(np.asarray(deltas).ravel()) + deltas = to_timedelta_unboxed(ravel(np.asarray(deltas))) unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) return _infer_time_units_from_diff(unique_timedeltas) @@ -643,7 +643,7 @@ def encode_datetime(d): except TypeError: return np.nan if d is None else cftime.date2num(d, units, calendar) - return np.array([encode_datetime(d) for d in dates.ravel()]).reshape(dates.shape) + return reshape(np.array([encode_datetime(d) for d in ravel(dates)]), dates.shape) def cast_to_int_if_safe(num) -> np.ndarray: @@ -753,7 +753,7 @@ def _eagerly_encode_cf_datetime( # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272). 
- dates_as_index = pd.DatetimeIndex(dates.ravel()) + dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date # retrieve needed units to faithfully encode to int64 @@ -791,7 +791,7 @@ def _eagerly_encode_cf_datetime( floor_division = True num = _division(time_deltas, time_delta, floor_division) - num = num.values.reshape(dates.shape) + num = reshape(num.values, dates.shape) except (OutOfBoundsDatetime, OverflowError, ValueError): num = _encode_datetime_with_cftime(dates, units, calendar) @@ -879,7 +879,7 @@ def _eagerly_encode_cf_timedelta( units = data_units time_delta = _time_units_to_timedelta64(units) - time_deltas = pd.TimedeltaIndex(timedeltas.ravel()) + time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) # retrieve needed units to faithfully encode to int64 needed_units = data_units @@ -911,7 +911,7 @@ def _eagerly_encode_cf_timedelta( floor_division = True num = _division(time_deltas, time_delta, floor_division) - num = num.values.reshape(timedeltas.shape) + num = reshape(num.values, timedeltas.shape) if dtype is not None: num = _cast_to_dtype_if_safe(num, dtype) From eb0fbd7d2692690038e96b32a816a36ea4267a8d Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 11 Jul 2024 03:58:00 +0200 Subject: [PATCH 6/9] Use duckarray assertions in test_coding_times (#9226) --- xarray/tests/test_coding_times.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index d568bdc3268..623e4e9f970 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -44,6 +44,8 @@ FirstElementAccessibleArray, arm_xfail, assert_array_equal, + assert_duckarray_allclose, + assert_duckarray_equal, assert_no_warnings, has_cftime, requires_cftime, @@ -144,13 +146,13 @@ def test_cf_datetime(num_dates, units, calendar) -> None: assert (abs_diff <= np.timedelta64(1, "s")).all() encoded, _, _ = 
encode_cf_datetime(actual, units, calendar) - assert_array_equal(num_dates, np.round(encoded, 1)) + assert_duckarray_allclose(num_dates, encoded) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: # verify that wrapping with a pandas.Index works # note that it *does not* currently work to put # non-datetime64 compatible dates into a pandas.Index encoded, _, _ = encode_cf_datetime(pd.Index(actual), units, calendar) - assert_array_equal(num_dates, np.round(encoded, 1)) + assert_duckarray_allclose(num_dates, encoded) @requires_cftime @@ -893,10 +895,10 @@ def test_time_units_with_timezone_roundtrip(calendar) -> None: ) if calendar in _STANDARD_CALENDARS: - np.testing.assert_array_equal(result_num_dates, expected_num_dates) + assert_duckarray_equal(result_num_dates, expected_num_dates) else: # cftime datetime arithmetic is not quite exact. - np.testing.assert_allclose(result_num_dates, expected_num_dates) + assert_duckarray_allclose(result_num_dates, expected_num_dates) assert result_units == expected_units assert result_calendar == calendar From ff15a08bea27674923afa494b303c6e5cb4d513c Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Wed, 10 Jul 2024 22:00:09 -0400 Subject: [PATCH 7/9] Fix time indexing regression in `convert_calendar` (#9192) * MRC -- Selecting with string for cftime See discussion in #9138 This commit and pull request mostly serves as a staging group for a potential fix. Test with: ``` pytest xarray/tests/test_cftimeindex.py::test_cftime_noleap_with_str ``` * effectively remove fastpath * Add docstring * Revert "effectively remove fastpath" This reverts commit 0f1a5a2271e5522b5dd946d7f4f38f591211286e. 
* Fix by reassigning coordinate * Update what's new entry * Simplify if condition --------- Co-authored-by: Spencer Clark --- doc/whats-new.rst | 6 ++++++ xarray/coding/calendar_ops.py | 12 +++++++++++- xarray/tests/test_calendar_ops.py | 25 ++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0c401c2348e..f237b406bd5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -47,6 +47,12 @@ Bug fixes By `Justus Magin `_. - Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`). By `Justus Magin `_. +- Address regression introduced in :pull:`9002` that prevented objects returned + by py:meth:`DataArray.convert_calendar` to be indexed by a time index in + certain circumstances (:issue:`9138`, :pull:`9192`). By `Mark Harfouche + `_ and `Spencer Clark + `. + - Fiy static typing of tolerance arguments by allowing `str` type (:issue:`8892`, :pull:`9194`). By `Michael Niklas `_. - Dark themes are now properly detected for ``html[data-theme=dark]``-tags (:pull:`9200`). diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py index c4fe9e1f4ae..6f492e78bf9 100644 --- a/xarray/coding/calendar_ops.py +++ b/xarray/coding/calendar_ops.py @@ -5,7 +5,10 @@ from xarray.coding.cftime_offsets import date_range_like, get_date_type from xarray.coding.cftimeindex import CFTimeIndex -from xarray.coding.times import _should_cftime_be_used, convert_times +from xarray.coding.times import ( + _should_cftime_be_used, + convert_times, +) from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like try: @@ -222,6 +225,13 @@ def convert_calendar( # Remove NaN that where put on invalid dates in target calendar out = out.where(out[dim].notnull(), drop=True) + if use_cftime: + # Reassign times to ensure time index of output is a CFTimeIndex + # (previously it was an Index due to the presence of NaN values). 
+ # Note this is not needed in the case that the output time index is + # a DatetimeIndex, since DatetimeIndexes can handle NaN values. + out[dim] = CFTimeIndex(out[dim].data) + if missing is not None: time_target = date_range_like(time, calendar=calendar, use_cftime=use_cftime) out = out.reindex({dim: time_target}, fill_value=missing) diff --git a/xarray/tests/test_calendar_ops.py b/xarray/tests/test_calendar_ops.py index 7d229371808..13e9f7a1030 100644 --- a/xarray/tests/test_calendar_ops.py +++ b/xarray/tests/test_calendar_ops.py @@ -1,9 +1,10 @@ from __future__ import annotations import numpy as np +import pandas as pd import pytest -from xarray import DataArray, infer_freq +from xarray import CFTimeIndex, DataArray, infer_freq from xarray.coding.calendar_ops import convert_calendar, interp_calendar from xarray.coding.cftime_offsets import date_range from xarray.testing import assert_identical @@ -286,3 +287,25 @@ def test_interp_calendar_errors(): ValueError, match="Both 'source.x' and 'target' must contain datetime objects." 
): interp_calendar(da1, da2, dim="x") + + +@requires_cftime +@pytest.mark.parametrize( + ("source_calendar", "target_calendar", "expected_index"), + [("standard", "noleap", CFTimeIndex), ("all_leap", "standard", pd.DatetimeIndex)], +) +def test_convert_calendar_produces_time_index( + source_calendar, target_calendar, expected_index +): + # https://github.com/pydata/xarray/issues/9138 + time = date_range("2000-01-01", "2002-01-01", freq="D", calendar=source_calendar) + temperature = np.ones(len(time)) + da = DataArray( + data=temperature, + dims=["time"], + coords=dict( + time=time, + ), + ) + converted = da.convert_calendar(target_calendar) + assert isinstance(converted.indexes["time"], expected_index) From 7087ca49629e07be004d92fd08f916e3359a57e1 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 11 Jul 2024 10:54:18 +0200 Subject: [PATCH 8/9] `numpy` 2 compatibility in the `netcdf4` and `h5netcdf` backends (#9136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * don't remove `netcdf4` from the upstream-dev environment * also stop removing `h5py` and `hdf5` * hard-code the precision (I believe this was missed in #9081) * don't remove `h5py` either * use on-disk _FillValue as standard expects, use view instead of cast to prevent OverflowError. 
* whats-new * unpin `numpy` * rework UnsignedCoder * add test * Update xarray/coding/variables.py Co-authored-by: Justus Magin --------- Co-authored-by: Kai Mühlbauer Co-authored-by: Kai Mühlbauer Co-authored-by: Deepak Cherian --- ci/install-upstream-wheels.sh | 5 ++-- ci/requirements/all-but-dask.yml | 2 +- ci/requirements/environment-windows.yml | 2 +- ci/requirements/environment.yml | 2 +- doc/whats-new.rst | 4 ++- xarray/coding/variables.py | 16 +++++++----- xarray/tests/test_backends.py | 33 +++++++++++++++++++++++-- 7 files changed, 49 insertions(+), 15 deletions(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index d728768439a..79fae3c46a9 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -13,7 +13,7 @@ $conda remove -y numba numbagg sparse # temporarily remove numexpr $conda remove -y numexpr # temporarily remove backends -$conda remove -y cf_units hdf5 h5py netcdf4 pydap +$conda remove -y cf_units pydap # forcibly remove packages to avoid artifacts $conda remove -y --force \ numpy \ @@ -37,8 +37,7 @@ python -m pip install \ numpy \ scipy \ matplotlib \ - pandas \ - h5py + pandas # for some reason pandas depends on pyarrow already. 
# Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge` python -m pip install \ diff --git a/ci/requirements/all-but-dask.yml b/ci/requirements/all-but-dask.yml index abf6a88690a..119db282ad9 100644 --- a/ci/requirements/all-but-dask.yml +++ b/ci/requirements/all-but-dask.yml @@ -22,7 +22,7 @@ dependencies: - netcdf4 - numba - numbagg - - numpy<2 + - numpy - packaging - pandas - pint>=0.22 diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 2eedc9b0621..896e390ea3e 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -23,7 +23,7 @@ dependencies: - netcdf4 - numba - numbagg - - numpy<2 + - numpy - packaging - pandas # - pint>=0.22 diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 317e1fe5f41..ef02a3e7f23 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -26,7 +26,7 @@ dependencies: - numba - numbagg - numexpr - - numpy<2 + - numpy - opt_einsum - packaging - pandas diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f237b406bd5..e8369dc2f40 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,6 +45,8 @@ Bug fixes By `Pontus Lurcock `_. - Allow diffing objects with array attributes on variables (:issue:`9153`, :pull:`9169`). By `Justus Magin `_. +- ``numpy>=2`` compatibility in the ``netcdf4`` backend (:pull:`9136`). + By `Justus Magin `_ and `Kai Mühlbauer `_. - Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`). By `Justus Magin `_. - Address regression introduced in :pull:`9002` that prevented objects returned @@ -67,7 +69,7 @@ Documentation - Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 `_). By `Jessica Scheick `_. - Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`) - By `Maximilian Roos `_ + By `Maximilian Roos `_. 
Internal Changes diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index d31cb6e626a..d19f285d2b9 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -516,10 +516,13 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) pop_to(encoding, attrs, "_Unsigned") - signed_dtype = np.dtype(f"i{data.dtype.itemsize}") + # we need the on-disk type here + # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available + signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}")) if "_FillValue" in attrs: - new_fill = signed_dtype.type(attrs["_FillValue"]) - attrs["_FillValue"] = new_fill + new_fill = np.array(attrs["_FillValue"]) + # use view here to prevent OverflowError + attrs["_FillValue"] = new_fill.view(signed_dtype).item() data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) return Variable(dims, data, attrs, encoding, fastpath=True) @@ -535,10 +538,11 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: if unsigned == "true": unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") transform = partial(np.asarray, dtype=unsigned_dtype) - data = lazy_elemwise_func(data, transform, unsigned_dtype) if "_FillValue" in attrs: - new_fill = unsigned_dtype.type(attrs["_FillValue"]) - attrs["_FillValue"] = new_fill + new_fill = np.array(attrs["_FillValue"], dtype=data.dtype) + # use view here to prevent OverflowError + attrs["_FillValue"] = new_fill.view(unsigned_dtype).item() + data = lazy_elemwise_func(data, transform, unsigned_dtype) elif data.dtype.kind == "u": if unsigned == "false": signed_dtype = np.dtype(f"i{data.dtype.itemsize}") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 15485dc178a..0b90a05262d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -166,7 +166,7 @@ def 
create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { - "_FillValue": 255, + "_FillValue": np.int8(-1), "_Unsigned": "true", "dtype": "i1", "add_offset": dtype.type(10), @@ -925,6 +925,35 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) + @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255)]) + def test_roundtrip_unsigned(self, fillvalue): + # regression/numpy2 test for + encoding = { + "_FillValue": fillvalue, + "_Unsigned": "true", + "dtype": "i1", + } + x = np.array([0, 1, 127, 128, 254, np.nan], dtype=np.float32) + decoded = Dataset({"x": ("t", x, {}, encoding)}) + + attributes = { + "_FillValue": fillvalue, + "_Unsigned": "true", + } + # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned + sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1") + encoded = Dataset({"x": ("t", sb, attributes)}) + + with self.roundtrip(decoded) as actual: + for k in decoded.variables: + assert decoded.variables[k].dtype == actual.variables[k].dtype + assert_allclose(decoded, actual, decode_bytes=False) + + with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + assert encoded.variables[k].dtype == actual.variables[k].dtype + assert_allclose(encoded, actual, decode_bytes=False) + @staticmethod def _create_cf_dataset(): original = Dataset( @@ -4285,7 +4314,7 @@ def test_roundtrip_coordinates_with_space(self) -> None: def test_roundtrip_numpy_datetime_data(self) -> None: # Override method in DatasetIOBase - remove not applicable # save_kwargs - times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"]) + times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"], unit="ns") expected = Dataset({"t": ("t", times), "t0": times[0]}) with self.roundtrip(expected) as 
actual: assert_identical(expected, actual) From e12aa447c31908c9022d33c226f1481763a6ed9a Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 11 Jul 2024 11:25:47 +0200 Subject: [PATCH 9/9] `numpy` 2 compatibility in the iris code paths (#9156) * don't remove `cf_units` (and thus `iris`) [skip-ci] * try keeping netcdf4, h5netcdf, and h5py --- ci/install-upstream-wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 79fae3c46a9..855336ad8bd 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -13,7 +13,7 @@ $conda remove -y numba numbagg sparse # temporarily remove numexpr $conda remove -y numexpr # temporarily remove backends -$conda remove -y cf_units pydap +$conda remove -y pydap # forcibly remove packages to avoid artifacts $conda remove -y --force \ numpy \