From 122b19b43ccf9e09db4040e1c49145312873b173 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 12 Jan 2025 17:13:46 -0500 Subject: [PATCH 1/3] Fix timedelta encoding overflow issue; always decode to ns resolution --- xarray/coding/times.py | 32 ++++++++++--------------------- xarray/tests/test_coding_times.py | 5 ++--- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index e23aa793ebd..eaa389def61 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -542,19 +542,6 @@ def decode_cf_datetime( return reshape(dates, num_dates.shape) -def to_timedelta_unboxed(value, **kwargs): - # todo: check, if the procedure here is correct - result = pd.to_timedelta(value, **kwargs).to_numpy() - unique_timedeltas = np.unique(result[pd.notnull(result)]) - unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) - if unit not in {"s", "ms", "us", "ns"}: - # default to "ns", when not specified - unit = "ns" - result = result.astype(f"timedelta64[{unit}]") - assert np.issubdtype(result.dtype, "timedelta64") - return result - - def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() assert np.issubdtype(result.dtype, "datetime64") @@ -604,21 +591,22 @@ def _numbers_to_timedelta( return flat_num.astype(f"timedelta64[{time_unit}]") -def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: - # todo: check, if this works as intended +def decode_cf_timedelta( + num_timedeltas, units: str, time_unit: str = "ns" +) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64 ["s", "ms", "us", "ns"] array. + numpy timedelta64[ns] array. """ + # TODO: add a time_unit argument to enable modifying the decoding + # resolution analogous to decode_cf_datetime. Currently we only support + # decoding to "ns" resolution. + time_unit = "ns" num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") - as_unit = unit - if unit not in {"s", "ms", "us", "ns"}: - # default to "ns", when not specified - as_unit = "ns" - result = pd.to_timedelta(ravel(timedeltas)).as_unit(as_unit).to_numpy() + result = pd.to_timedelta(ravel(timedeltas)).as_unit(time_unit).to_numpy() return reshape(result, timedeltas.shape) @@ -700,7 +688,7 @@ def infer_timedelta_units(deltas) -> str: {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly divide all unique time deltas in `deltas`) """ - deltas = to_timedelta_unboxed(ravel(np.asarray(deltas))) + deltas = ravel(deltas) unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) return _infer_time_units_from_diff(unique_timedeltas) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6be9b1c475a..58adfede70c 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -34,7 +34,6 @@ format_cftime_datetime, infer_datetime_units, infer_timedelta_units, - to_timedelta_unboxed, ) from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder @@ -635,7 +634,7 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: if timedeltas == "NaT": timedeltas = np.timedelta64("NaT", "ns") else: - timedeltas = to_timedelta_unboxed(timedeltas) + timedeltas = pd.to_timedelta(timedeltas).to_numpy() numbers = np.array(numbers) expected = numbers @@ -659,7 +658,7 @@ def test_cf_timedelta_2d() -> None: units = "days" numbers = np.atleast_2d([1, 2, 3]) - timedeltas = np.atleast_2d(to_timedelta_unboxed(["1D", "2D", "3D"])) + timedeltas = np.atleast_2d(pd.to_timedelta(["1D", "2D", "3D"]).to_numpy()) expected = timedeltas actual = decode_cf_timedelta(numbers, units) From 99932e9ed4363120f90a533773d5cd9ce96f6f07 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 12 Jan 2025 18:39:35 -0500 Subject: [PATCH 2/3] Implement time_unit for decode_cf_timedelta --- xarray/coding/times.py | 42 +++++++++++++++++++++++++----- xarray/tests/test_coding_times.py | 43 +++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index eaa389def61..94496d97894 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -365,6 +365,21 @@ def _check_date_for_units_since_refdate( return pd.Timestamp("NaT") +def _check_timedelta_range(value, data_unit, time_unit): + if value > np.iinfo("int64").max or value < np.iinfo("int64").min: + OutOfBoundsTimedelta(f"Value {value} can't be represented as Timedelta.") + delta = value * np.timedelta64(1, data_unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if value.dtype.kind in "u" and not np.int64(delta) == value: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow if delta cannot be represented with the + # resolutions supported by pandas. + pd.to_timedelta(delta) + + def _align_reference_date_and_unit( ref_date: pd.Timestamp, unit: NPDatetimeUnitOptions ) -> pd.Timestamp: @@ -595,19 +610,32 @@ def decode_cf_timedelta( num_timedeltas, units: str, time_unit: str = "ns" ) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. + numpy timedelta64 {"s", "ms", "us", "ns"} array. """ - # TODO: add a time_unit argument to enable modifying the decoding - # resolution analogous to decode_cf_datetime. Currently we only support - # decoding to "ns" resolution. - time_unit = "ns" num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) + _check_timedelta_range(num_timedeltas.min(), unit, time_unit) + _check_timedelta_range(num_timedeltas.max(), unit, time_unit) + timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") + timedeltas = pd.to_timedelta(ravel(timedeltas)) + + if np.isnat(timedeltas).all(): + empirical_unit = time_unit + else: + empirical_unit = timedeltas.unit + + if np.timedelta64(1, time_unit) > np.timedelta64(1, empirical_unit): + time_unit = empirical_unit + + if time_unit not in {"s", "ms", "us", "ns"}: + raise ValueError( + f"time_unit must be one of 's', 'ms', 'us', or 'ns'. Got: {time_unit}" + ) - result = pd.to_timedelta(ravel(timedeltas)).as_unit(time_unit).to_numpy() - return reshape(result, timedeltas.shape) + result = timedeltas.as_unit(time_unit).to_numpy() + return reshape(result, num_timedeltas.shape) def _unit_timedelta_cftime(units: str) -> timedelta: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 58adfede70c..b85c57a5597 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -631,10 +631,11 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: ], ) def test_cf_timedelta(timedeltas, units, numbers) -> None: + time_unit = "ns" if timedeltas == "NaT": - timedeltas = np.timedelta64("NaT", "ns") + timedeltas = np.timedelta64("NaT", time_unit) else: - timedeltas = pd.to_timedelta(timedeltas).to_numpy() + timedeltas = pd.to_timedelta(timedeltas).as_unit(time_unit).to_numpy() numbers = np.array(numbers) expected = numbers @@ -644,12 +645,12 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: if units is not None: expected = timedeltas - actual = decode_cf_timedelta(numbers, units) + actual = decode_cf_timedelta(numbers, units, time_unit) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype - expected = np.timedelta64("NaT", "ns") - actual = decode_cf_timedelta(np.array(np.nan), "days") + expected = np.timedelta64("NaT", time_unit) + actual = decode_cf_timedelta(np.array(np.nan), "days", time_unit) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype @@ -666,6 +667,38 @@ def test_cf_timedelta_2d() -> None: assert expected.dtype == actual.dtype +@pytest.mark.parametrize("encoding_unit", FREQUENCIES_TO_ENCODING_UNITS.values()) +def test_decode_cf_timedelta_time_unit(time_unit, encoding_unit) -> None: + encoded = 1 + encoding_unit_as_numpy = _netcdf_to_numpy_timeunit(encoding_unit) + if np.timedelta64(1, time_unit) > np.timedelta64(1, encoding_unit_as_numpy): + expected = np.timedelta64(encoded, encoding_unit_as_numpy) + else: + expected = np.timedelta64(encoded, encoding_unit_as_numpy).astype( + f"timedelta64[{time_unit}]" + ) + result = decode_cf_timedelta(encoded, encoding_unit, time_unit) + assert result == expected + assert result.dtype == expected.dtype + + +def test_decode_cf_timedelta_time_unit_out_of_bounds(time_unit): + # Define a scale factor that will guarantee overflow with the given + # time_unit. + scale_factor = np.timedelta64(1, time_unit) // np.timedelta64(1, "ns") + encoded = scale_factor * 300 * 365 + with pytest.raises(OutOfBoundsTimedelta): + decode_cf_timedelta(encoded, "days", time_unit) + + +def test_cf_timedelta_roundtrip_large_value(time_unit): + value = np.timedelta64(np.iinfo(np.int64).max, time_unit) + encoded, units = encode_cf_timedelta(value) + decoded = decode_cf_timedelta(encoded, units, time_unit=time_unit) + assert value == decoded + assert value.dtype == decoded.dtype + + @pytest.mark.parametrize( ["deltas", "expected"], [ From 97bb93eae22af13aa85d800314dde5671d967f9e Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 12 Jan 2025 18:44:18 -0500 Subject: [PATCH 3/3] Reduce diff --- xarray/coding/times.py | 2 +- xarray/tests/test_coding_times.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 94496d97894..39e7c94c366 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -610,7 +610,7 @@ def decode_cf_timedelta( num_timedeltas, units: str, time_unit: str = "ns" ) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64 {"s", "ms", "us", "ns"} array. + numpy timedelta64 ["s", "ms", "us", "ns"] array. """ num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index b85c57a5597..ed387e22d60 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -631,11 +631,10 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: ], ) def test_cf_timedelta(timedeltas, units, numbers) -> None: - time_unit = "ns" if timedeltas == "NaT": - timedeltas = np.timedelta64("NaT", time_unit) + timedeltas = np.timedelta64("NaT", "ns") else: - timedeltas = pd.to_timedelta(timedeltas).as_unit(time_unit).to_numpy() + timedeltas = pd.to_timedelta(timedeltas).to_numpy() numbers = np.array(numbers) expected = numbers @@ -645,12 +644,12 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: if units is not None: expected = timedeltas - actual = decode_cf_timedelta(numbers, units, time_unit) + actual = decode_cf_timedelta(numbers, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype - expected = np.timedelta64("NaT", time_unit) - actual = decode_cf_timedelta(np.array(np.nan), "days", time_unit) + expected = np.timedelta64("NaT", "ns") + actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) assert expected.dtype == actual.dtype