Preserve nanosecond resolution when encoding/decoding times #7827

Merged: 26 commits merged into main from nanoseconds-resolution on Sep 17, 2023.
Changes shown from 21 of the 26 commits.

Commits
db94bb9  preserve nanosecond resolution when encoding/decoding times. (kmuehlbauer, Aug 31, 2023)
82b74ec  Apply suggestions from code review (kmuehlbauer, Sep 4, 2023)
8da55ac  use emit_user_level_warning (kmuehlbauer, Sep 4, 2023)
96f60fc  move time alignment for nc3 to encode_nc3_variable (kmuehlbauer, Sep 4, 2023)
4f6440a  fix test for encode_cf_timedelta (kmuehlbauer, Sep 4, 2023)
f8e961f  fix CFMaskCoder for time-like (also allow timedelta64), add first tests (kmuehlbauer, Sep 4, 2023)
c97d7b4  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 5, 2023)
06ed4bf  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 5, 2023)
5e4a5ee  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 7, 2023)
2365da0  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 11, 2023)
d020bbc  rename to _unpack_time_units_and_ref_date as suggested in review (kmuehlbauer, Sep 10, 2023)
a8c6057  refactor delta -> time_units as suggested in review (kmuehlbauer, Sep 10, 2023)
9b96ff7  refactor out function _time_units_to_timedelta64, reorder flow and re… (kmuehlbauer, Sep 10, 2023)
5adb58e  import _is_time_like from coding.variables (kmuehlbauer, Sep 11, 2023)
87fbb1a  adapt tests, add _numpy_to_netcdf_timeunit-conversion function (kmuehlbauer, Sep 11, 2023)
1fbc881  adapt tests, add _numpy_to_netcdf_timeunit-conversion function (kmuehlbauer, Sep 11, 2023)
8a6ecb5  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 11, 2023)
c1f810c  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 11, 2023)
7f2b8b4  Merge branch 'main' into nanoseconds-resolution (kmuehlbauer, Sep 13, 2023)
d538ea9  adapt test as per review, remove arm_xfail for backend test (kmuehlbauer, Sep 13, 2023)
a75e4b8  add whats-new.rst entry (kmuehlbauer, Sep 13, 2023)
d4a71cd  Update doc/whats-new.rst (kmuehlbauer, Sep 14, 2023)
4dca66e  Update doc/whats-new.rst (kmuehlbauer, Sep 14, 2023)
ee1b939  Merge remote-tracking branch 'origin/main' into nanoseconds-resolution (kmuehlbauer, Sep 16, 2023)
ebb00b8  fix whats-new.rst (kmuehlbauer, Sep 16, 2023)
c4d4a92  Merge remote-tracking branch 'origin/main' into nanoseconds-resolution (kmuehlbauer, Sep 17, 2023)
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -63,6 +63,10 @@ Bug fixes
- Fix bug where :py:class:`DataArray` instances on the right-hand side
  of :py:meth:`DataArray.__setitem__` lose dimension names.
  (:issue:`7030`, :pull:`8067`) By `Darsh Ranjan <https://github.com/dranjan>`_.
+- Fixed a bug where casting from ``float`` to ``int64`` (undefined for ``NaN``) led to varying
+  issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
+  :issue:`1064`, :pull:`7827`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
@@ -73,6 +77,8 @@ Internal Changes

- Many error messages related to invalid dimensions or coordinates now always show the list of valid dims/coords (:pull:`8079`).
  By `András Gunyhó <https://github.com/mgunyho>`_.
+- Refactor of encoding and decoding times/timedeltas to preserve nanosecond resolution (:pull:`7827`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

.. _whats-new.2023.08.0:

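For illustration (editor's sketch, not part of the diff): the root cause behind the issues listed above is that casting NaN to int64 is undefined behaviour in NumPy, so encode paths that went through float could produce arbitrary, platform-dependent integers:

import numpy as np

# Undefined cast: NumPy warns ("invalid value encountered in cast") and the
# result is platform-dependent; it is commonly the smallest int64 on x86,
# but nothing guarantees that (ARM behaves differently, which is why the
# backend test below previously carried arm_xfail).
np.array([np.nan]).astype(np.int64)  # e.g. array([-9223372036854775808])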
19 changes: 18 additions & 1 deletion xarray/backends/netcdf3.py
@@ -88,13 +88,30 @@ def encode_nc3_attrs(attrs):
    return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}


+def _maybe_prepare_times(var):
+    # checks for integer-based time-like data and
+    # replaces the np.iinfo(np.int64).min sentinel with _FillValue or np.nan;
+    # this keeps backwards compatibility
+
+    data = var.data
+    if data.dtype.kind in "iu":
+        units = var.attrs.get("units", None)
+        if units is not None:
+            if coding.variables._is_time_like(units):
+                mask = data == np.iinfo(np.int64).min
+                if mask.any():
+                    data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
+    return data
+
+
def encode_nc3_variable(var):
    for coder in [
        coding.strings.EncodedStringCoder(allows_unicode=False),
        coding.strings.CharacterArrayCoder(),
    ]:
        var = coder.encode(var)
-    data = coerce_nc3_dtype(var.data)
+    data = _maybe_prepare_times(var)
+    data = coerce_nc3_dtype(data)
    attrs = encode_nc3_attrs(var.attrs)
    return Variable(var.dims, data, attrs, var.encoding)
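A minimal sketch of _maybe_prepare_times (editor's example with hypothetical values, not part of the diff): integer time-like data arriving with the int64-min NaT sentinel gets the variable's _FillValue restored before the netCDF3 write:

import numpy as np
from xarray import Variable

# hypothetical variable: NaT was encoded upstream as np.iinfo(np.int64).min
var = Variable(
    ("t",),
    np.array([0, np.iinfo(np.int64).min, 120], dtype="int64"),
    attrs={"units": "minutes since 2000-01-01", "_FillValue": -9999},
)
# _maybe_prepare_times(var) would return array([0, -9999, 120])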
139 changes: 104 additions & 35 deletions xarray/coding/times.py
@@ -25,6 +25,7 @@
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pycompat import is_duck_dask_array
+from xarray.core.utils import emit_user_level_warning
from xarray.core.variable import Variable

try:
@@ -122,6 +123,18 @@ def _netcdf_to_numpy_timeunit(units: str) -> str:
    }[units]


+def _numpy_to_netcdf_timeunit(units: str) -> str:
+    return {
+        "ns": "nanoseconds",
+        "us": "microseconds",
+        "ms": "milliseconds",
+        "s": "seconds",
+        "m": "minutes",
+        "h": "hours",
+        "D": "days",
+    }[units]
+
+
def _ensure_padded_year(ref_date: str) -> str:
    # Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
    # are ambiguous (is it YMD or DMY?). This can lead to some very odd
@@ -171,6 +184,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
    return delta_units, ref_date


+def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
+    # same as _unpack_netcdf_time_units but finalizes ref_date for
+    # processing in encode_cf_datetime
+    time_units, _ref_date = _unpack_netcdf_time_units(units)
+    # TODO: the strict enforcement of nanosecond precision Timestamps can be
+    # relaxed when addressing GitHub issue #7493.
+    ref_date = nanosecond_precision_timestamp(_ref_date)
+    # If the ref_date Timestamp is timezone-aware, convert to UTC and
+    # make it timezone-naive (GH 2649).
+    if ref_date.tz is not None:
+        ref_date = ref_date.tz_convert(None)
+    return time_units, ref_date
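For illustration (not part of the diff): the new helper returns the unit string plus a timezone-naive, nanosecond-precision reference timestamp, e.g.

_unpack_time_units_and_ref_date("days since 2000-01-01")
# -> ("days", pd.Timestamp("2000-01-01 00:00:00"))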


def _decode_cf_datetime_dtype(
    data, units: str, calendar: str, use_cftime: bool | None
) -> np.dtype:
@@ -222,8 +249,8 @@ def _decode_datetime_with_pandas(
            "pandas."
        )

-    delta, ref_date = _unpack_netcdf_time_units(units)
-    delta = _netcdf_to_numpy_timeunit(delta)
+    time_units, ref_date = _unpack_netcdf_time_units(units)
+    time_units = _netcdf_to_numpy_timeunit(time_units)
    try:
        # TODO: the strict enforcement of nanosecond precision Timestamps can be
        # relaxed when addressing GitHub issue #7493.
@@ -237,8 +264,8 @@
        warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
        if flat_num_dates.size > 0:
            # avoid size 0 datetimes GH1329
-            pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
-            pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
+            pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
+            pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date

    # To avoid integer overflow when converting to nanosecond units for integer
    # dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -251,9 +278,12 @@

    # Cast input ordinals to integers of nanoseconds because pd.to_timedelta
    # works much faster when dealing with integers (GH 1399).
-    flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
-        np.int64
-    )
+    # properly handle NaN/NaT to prevent casting NaN to int
+    nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
+    flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
+    flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
+    flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
+    flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)

    # Use pd.to_timedelta to safely cast integer values to timedeltas,
    # and add those to a Timestamp to safely produce a DatetimeIndex. This
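For illustration (editor's sketch, not part of the diff), the NaT-aware path above never casts NaN to int:

import numpy as np

flat_num_dates = np.array([0.0, np.nan])  # "seconds since ..." ordinals
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * 1_000_000_000  # seconds to nanoseconds
out = np.zeros_like(flat_num_dates, dtype=np.int64)
out[nan] = np.iinfo(np.int64).min  # int64-min is pandas' NaT sentinel
out[~nan] = flat_num_dates[~nan].astype(np.int64)
# out -> array([0, -9223372036854775808]); pd.to_datetime(out) gives
# DatetimeIndex(['1970-01-01', 'NaT'])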
@@ -364,6 +394,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
    return "seconds"


+def _time_units_to_timedelta64(units: str) -> np.timedelta64:
+    return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]")
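For illustration (not part of the diff): _time_units_to_timedelta64("days") gives np.timedelta64(86400000000000, 'ns'), i.e. one day in nanosecond resolution, so comparisons like time_delta <= needed_time_delta further down work on a common footing.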


def infer_calendar_name(dates) -> CFCalendar:
    """Given an array of datetimes, infer the CF calendar name"""
    if is_np_datetime_like(dates.dtype):
@@ -572,9 +606,12 @@ def _should_cftime_be_used(


def _cleanup_netcdf_time_units(units: str) -> str:
-    delta, ref_date = _unpack_netcdf_time_units(units)
+    time_units, ref_date = _unpack_netcdf_time_units(units)
+    time_units = time_units.lower()
+    if not time_units.endswith("s"):
+        time_units = f"{time_units}s"
    try:
-        units = f"{delta} since {format_timestamp(ref_date)}"
+        units = f"{time_units} since {format_timestamp(ref_date)}"
    except (OutOfBoundsDatetime, ValueError):
        # don't worry about reifying the units if they're out of bounds or
        # formatted badly
@@ -633,62 +670,93 @@ def encode_cf_datetime(
    """
    dates = np.asarray(dates)

+    data_units = infer_datetime_units(dates)
+
    if units is None:
-        units = infer_datetime_units(dates)
+        units = data_units
    else:
        units = _cleanup_netcdf_time_units(units)

    if calendar is None:
        calendar = infer_calendar_name(dates)

-    delta, _ref_date = _unpack_netcdf_time_units(units)
    try:
        if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
            # parse with cftime instead
            raise OutOfBoundsDatetime
        assert dates.dtype == "datetime64[ns]"

-        delta_units = _netcdf_to_numpy_timeunit(delta)
-        time_delta = np.timedelta64(1, delta_units).astype("timedelta64[ns]")
-
-        # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        ref_date = nanosecond_precision_timestamp(_ref_date)
+        time_units, ref_date = _unpack_time_units_and_ref_date(units)
+        time_delta = _time_units_to_timedelta64(time_units)

-        # If the ref_date Timestamp is timezone-aware, convert to UTC and
-        # make it timezone-naive (GH 2649).
-        if ref_date.tz is not None:
-            ref_date = ref_date.tz_convert(None)
+        # retrieve needed units to faithfully encode to int64
+        needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
+        if data_units != units:
+            # this accounts for differences in the reference times
+            ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
+            if ref_delta > np.timedelta64(0, "ns"):
+                needed_units = _infer_time_units_from_diff(ref_delta)

        # Wrap the dates in a DatetimeIndex to do the subtraction to ensure
        # an OverflowError is raised if the ref_date is too far away from
        # dates to be encoded (GH 2272).
        dates_as_index = pd.DatetimeIndex(dates.ravel())
        time_deltas = dates_as_index - ref_date

-        # Use floor division if time_delta evenly divides all differences
-        # to preserve integer dtype if possible (GH 4045).
-        if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
-            num = time_deltas // time_delta
+        # needed time delta to encode faithfully to int64
+        needed_time_delta = _time_units_to_timedelta64(needed_units)
+        if time_delta <= needed_time_delta:
+            # calculate int64 floor division
+            # to preserve integer dtype if possible (GH 4045, GH7817).
+            num = time_deltas // time_delta.astype(np.int64)
+            num = num.astype(np.int64, copy=False)
        else:
+            emit_user_level_warning(
+                f"Times can't be serialized faithfully with requested units {units!r}. "
+                f"Resolution of {needed_units!r} needed. "
+                f"Serializing timeseries to floating point."
+            )
            num = time_deltas / time_delta
        num = num.values.reshape(dates.shape)

    except (OutOfBoundsDatetime, OverflowError, ValueError):
        num = _encode_datetime_with_cftime(dates, units, calendar)
+        # do it now only for cftime-based flow
+        # we already covered for this in pandas-based flow
+        num = cast_to_int_if_safe(num)

-    num = cast_to_int_if_safe(num)
    return (num, units, calendar)


def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarray, str]:
-    if units is None:
-        units = infer_timedelta_units(timedeltas)
+    data_units = infer_timedelta_units(timedeltas)

-    np_unit = _netcdf_to_numpy_timeunit(units)
-    num = 1.0 * timedeltas / np.timedelta64(1, np_unit)
-    num = np.where(pd.isnull(timedeltas), np.nan, num)
-    num = cast_to_int_if_safe(num)
+    if units is None:
+        units = data_units
+
+    time_delta = _time_units_to_timedelta64(units)
+    time_deltas = pd.TimedeltaIndex(timedeltas.ravel())
+
+    # retrieve needed units to faithfully encode to int64
+    needed_units = data_units
+    if data_units != units:
+        needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna()))
+
+    # needed time delta to encode faithfully to int64
+    needed_time_delta = _time_units_to_timedelta64(needed_units)
+    if time_delta <= needed_time_delta:
+        # calculate int64 floor division
+        # to preserve integer dtype if possible
+        num = time_deltas // time_delta.astype(np.int64)
+        num = num.astype(np.int64, copy=False)
+    else:
+        emit_user_level_warning(
+            f"Timedeltas can't be serialized faithfully with requested units {units!r}. "
+            f"Resolution of {needed_units!r} needed. "
+            f"Serializing timedeltas to floating point."
+        )
+        num = time_deltas / time_delta
+    num = num.values.reshape(timedeltas.shape)
    return (num, units)


@@ -702,9 +770,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        ) or contains_cftime_datetimes(variable):
            dims, data, attrs, encoding = unpack_for_encoding(variable)

-            (data, units, calendar) = encode_cf_datetime(
-                data, encoding.pop("units", None), encoding.pop("calendar", None)
-            )
+            units = encoding.pop("units", None)
+            calendar = encoding.pop("calendar", None)
+            (data, units, calendar) = encode_cf_datetime(data, units, calendar)
+
            safe_setitem(attrs, "units", units, name=name)
            safe_setitem(attrs, "calendar", calendar, name=name)
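A sketch of the new encode behaviour (editor's example against this branch; exact output hedged): requesting units coarser than the data's resolution now warns and falls back to floating point instead of silently truncating, while units=None picks a faithful resolution:

import numpy as np
from xarray.coding.times import encode_cf_datetime

times = np.array(
    ["2000-01-01T00:00:00", "2000-01-01T00:00:00.000000001"],
    dtype="datetime64[ns]",
)
# coarser units than the data can faithfully carry: warn + float64
num, units, _ = encode_cf_datetime(times, units="seconds since 2000-01-01")
# UserWarning: "Times can't be serialized faithfully ..."; num ~ [0.0, 1e-09]

# no units requested: resolution is inferred and integers are preserved
num, units, _ = encode_cf_datetime(times)
# units == "nanoseconds since 2000-01-01", num == [0, 1]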
38 changes: 35 additions & 3 deletions xarray/coding/variables.py
@@ -215,6 +215,21 @@ def _apply_mask(
    return np.where(condition, decoded_fill_value, data)


+def _is_time_like(units):
+    # test for time-like
+    time_strings = [
+        "since",
+        "days",
+        "hours",
+        "minutes",
+        "seconds",
+        "milliseconds",
+        "microseconds",
+        "nanoseconds",
+    ]
+    return any(tstr in str(units) for tstr in time_strings)
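For illustration (not part of the diff): _is_time_like is a plain substring test on the units attribute, e.g. _is_time_like("seconds since 1970-01-01") is True and _is_time_like("kg m-2") is False; any units string containing one of the listed words matches.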


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

@@ -236,19 +251,32 @@ def encode(self, variable: Variable, name: T_Name = None):
                f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
            )

+        # special case DateTime to properly handle NaT
+        is_time_like = _is_time_like(attrs.get("units"))
+
        if fv_exists:
            # Ensure _FillValue is cast to same dtype as data's
            encoding["_FillValue"] = dtype.type(fv)
            fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
            if not pd.isnull(fill_value):
-                data = duck_array_ops.fillna(data, fill_value)
+                if is_time_like and data.dtype.kind in "iu":
+                    data = duck_array_ops.where(
+                        data != np.iinfo(np.int64).min, data, fill_value
+                    )
+                else:
+                    data = duck_array_ops.fillna(data, fill_value)

        if mv_exists:
            # Ensure missing_value is cast to same dtype as data's
            encoding["missing_value"] = dtype.type(mv)
            fill_value = pop_to(encoding, attrs, "missing_value", name=name)
            if not pd.isnull(fill_value) and not fv_exists:
-                data = duck_array_ops.fillna(data, fill_value)
+                if is_time_like and data.dtype.kind in "iu":
+                    data = duck_array_ops.where(
+                        data != np.iinfo(np.int64).min, data, fill_value
+                    )
+                else:
+                    data = duck_array_ops.fillna(data, fill_value)

        return Variable(dims, data, attrs, encoding, fastpath=True)

@@ -275,7 +303,11 @@ def decode(self, variable: Variable, name: T_Name = None):
                stacklevel=3,
            )

-        dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+        # special case DateTime to properly handle NaT
+        if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
+            dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
+        else:
+            dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)

        if encoded_fill_values:
            transform = partial(
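A sketch of the decode-side effect (editor's example against this branch): a masked integer time variable now keeps its integer dtype instead of being promoted to float64, with int64-min standing in for NaT until the time decoder runs:

import numpy as np
from xarray import Variable
from xarray.coding.variables import CFMaskCoder

v = Variable(
    ("t",),
    np.array([0, -9999, 120], dtype="int64"),
    attrs={"units": "minutes since 2000-01-01", "_FillValue": -9999},
)
decoded = CFMaskCoder().decode(v)
# decoded.dtype == int64 (previously float64); the -9999 entry becomes
# np.iinfo(np.int64).min, which the datetime decoder later maps to NaT.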
2 changes: 0 additions & 2 deletions xarray/tests/test_backends.py
@@ -54,7 +54,6 @@
from xarray.core.options import set_options
from xarray.core.pycompat import array_type
from xarray.tests import (
-    arm_xfail,
    assert_allclose,
    assert_array_equal,
    assert_equal,
@@ -526,7 +525,6 @@ def test_roundtrip_string_encoded_characters(self) -> None:
            assert_identical(expected, actual)
            assert actual["x"].encoding["_Encoding"] == "ascii"

-    @arm_xfail
    def test_roundtrip_numpy_datetime_data(self) -> None:
        times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
        expected = Dataset({"t": ("t", times), "t0": times[0]})