Skip to content

Fix critical np.timedelta64 encoding bugs #10469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 2, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Unify timedelta64 coding logic between the old and new approaches
Always write a dtype attribute to disk regardless of how the timedeltas were
decoded.
  • Loading branch information
spencerkclark committed Jul 1, 2025
commit bdda733a569713f640f6ae65900fce79d6d1eb53
9 changes: 5 additions & 4 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,11 @@ Bug fixes
(:pull:`10352`). By `Spencer Clark <https://github.com/spencerkclark>`_.
- Avoid unsafe casts from float to unsigned int in CFMaskCoder (:issue:`9815`, :pull:`9964`).
By `Elliott Sales de Andrade <https://github.com/QuLogic>`_.
- Fix attribute overwriting bug when decoding literally encoded
:py:class:`numpy.timedelta64` values from disk (:issue:`10468`,
:pull:`10469`). By `Spencer Clark <https://github.com/spencerkclark>`_.
- Fix default ``"_FillValue"`` dtype coercion bug when literally encoding
- Fix attribute overwriting bug when decoding encoded
:py:class:`numpy.timedelta64` values from disk with a dtype attribute
(:issue:`10468`, :pull:`10469`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Fix default ``"_FillValue"`` dtype coercion bug when encoding
:py:class:`numpy.timedelta64` values to an on-disk format that only supports
32-bit integers (:issue:`10466`, :pull:`10469`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
Expand Down
20 changes: 3 additions & 17 deletions xarray/backends/netcdf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def encode_nc3_attrs(attrs):
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}


def _maybe_prepare_times(var, name=None):
def _maybe_prepare_times(var):
# checks for integer-based time-like and
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
# this keeps backwards compatibility
Expand All @@ -112,21 +112,7 @@ def _maybe_prepare_times(var, name=None):
if data.dtype.kind in "iu":
units = var.attrs.get("units", None)
if units is not None and coding.variables._is_time_like(units):
default_int64_fill_value = np.iinfo(np.int64).min
default_int32_fill_value = np.iinfo(np.int32).min
mask = data == default_int64_fill_value

if var.attrs.get("_FillValue") == default_int64_fill_value:
if (data == default_int32_fill_value).any():
raise ValueError(
f"Could not safely coerce default int64 _FillValue "
f"({default_int64_fill_value}) to the analogous int32 "
f"value ({default_int32_fill_value}), since it "
f"already exists as non-missing within variable "
f"{name!r}. Try explicitly setting "
f"encoding['_FillValue'] to another int32 value."
)
var.attrs["_FillValue"] = default_int32_fill_value
mask = data == np.iinfo(np.int64).min
if mask.any():
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
return data
Expand All @@ -138,7 +124,7 @@ def encode_nc3_variable(var, name=None):
coding.strings.CharacterArrayCoder(),
]:
var = coder.encode(var, name=name)
data = _maybe_prepare_times(var, name=name)
data = _maybe_prepare_times(var)
data = coerce_nc3_dtype(data)
attrs = encode_nc3_attrs(var.attrs)
return Variable(var.dims, data, attrs, var.encoding)
Expand Down
157 changes: 67 additions & 90 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,43 @@ def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
return isinstance(dtype, str) and dtype.startswith("timedelta64")


def resolve_time_unit_from_attrs_dtype(
    attrs_dtype: str, name: T_Name
) -> PDDatetimeUnitOptions:
    """Choose the decoding time unit for a stored timedelta64 dtype string.

    The unit is read from ``attrs_dtype`` via ``np.datetime_data``. If one
    tick of that unit is longer than one second, or shorter than one
    nanosecond, a warning is emitted and the nearest supported unit is
    returned instead.

    Parameters
    ----------
    attrs_dtype : str
        Dtype string handed to ``np.dtype``, e.g. ``"timedelta64[ms]"``.
    name : T_Name
        Variable name; only used to build the warning text.

    Returns
    -------
    PDDatetimeUnitOptions
        ``"s"`` if the encoded unit is coarser than seconds, ``"ns"`` if it
        is finer than nanoseconds, otherwise the encoded unit itself.
    """
    resolution, _ = np.datetime_data(np.dtype(attrs_dtype))
    resolution = cast(NPDatetimeUnitOptions, resolution)
    # A single tick of the encoded unit, compared against the two bounds of
    # the supported range (seconds first, then nanoseconds).
    one_tick = np.timedelta64(1, resolution)

    if one_tick > np.timedelta64(1, "s"):
        fallback_unit = cast(PDDatetimeUnitOptions, "s")
        warning_text = (
            f"Following pandas, xarray only supports decoding to timedelta64 "
            f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
            f"values for variable {name!r} have a resolution of "
            f"{resolution!r}. Attempting to decode to a resolution of 's'. "
            f"Note, depending on the encoded values, this may lead to an "
            f"OverflowError. Additionally, data will not be identically round "
            f"tripped; xarray will choose an encoding dtype of "
            f"'timedelta64[s]' when re-encoding."
        )
    elif one_tick < np.timedelta64(1, "ns"):
        fallback_unit = cast(PDDatetimeUnitOptions, "ns")
        warning_text = (
            f"Following pandas, xarray only supports decoding to timedelta64 "
            f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
            f"values for variable {name!r} have a resolution of "
            f"{resolution!r}. Attempting to decode to a resolution of 'ns'. "
            f"Note, depending on the encoded values, this may lead to loss of "
            f"precision. Additionally, data will not be identically round "
            f"tripped; xarray will choose an encoding dtype of "
            f"'timedelta64[ns]' when re-encoding."
        )
    else:
        # Encoded unit already lies within the supported range: use it as is.
        return cast(PDDatetimeUnitOptions, resolution)

    emit_user_level_warning(warning_text)
    return fallback_unit


class CFTimedeltaCoder(VariableCoder):
"""Coder for CF Timedelta coding.

Expand All @@ -1430,7 +1467,7 @@ class CFTimedeltaCoder(VariableCoder):

def __init__(
self,
time_unit: PDDatetimeUnitOptions = "ns",
time_unit: PDDatetimeUnitOptions | None = None,
decode_via_units: bool = True,
decode_via_dtype: bool = True,
) -> None:
Expand All @@ -1442,45 +1479,18 @@ def __init__(
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
dtype = encoding.get("dtype", None)
units = encoding.pop("units", None)
dtype = encoding.get("dtype", None)
units = encoding.pop("units", None)

# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"
# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"

else:
resolution, _ = np.datetime_data(variable.dtype)
dtype = np.int64
attrs_dtype = f"timedelta64[{resolution}]"
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
# Remove dtype encoding if it exists to prevent it from
# interfering downstream in NonStringCoder.
encoding.pop("dtype", None)

if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
f"Specifying 'add_offset' or 'scale_factor' is not "
f"supported when encoding the timedelta64 values of "
f"variable {name!r} with xarray's new default "
f"timedelta64 encoding approach. To encode {name!r} "
f"with xarray's previous timedelta64 encoding "
f"approach, which supports the 'add_offset' and "
f"'scale_factor' parameters, additionally set "
f"encoding['units'] to a unit of time, e.g. "
f"'seconds'. To proceed with encoding of {name!r} "
f"via xarray's new approach, remove any encoding "
f"entries for 'add_offset' or 'scale_factor'."
)
if "_FillValue" not in encoding and "missing_value" not in encoding:
encoding["_FillValue"] = np.iinfo(np.int64).min
resolution, _ = np.datetime_data(variable.dtype)
attrs_dtype = f"timedelta64[{resolution}]"
safe_setitem(attrs, "dtype", attrs_dtype, name=name)

data, units = encode_cf_timedelta(data, units, dtype)
safe_setitem(attrs, "units", units, name=name)
Expand All @@ -1499,57 +1509,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
):
dims, data, attrs, encoding = unpack_for_decoding(variable)
units = pop_to(attrs, encoding, "units")
if is_dtype_decodable and self.decode_via_dtype:
if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
f"Decoding timedelta64 values via dtype is not "
f"supported when 'add_offset', or 'scale_factor' are "
f"present in encoding. Check the encoding parameters "
f"of variable {name!r}."
)
# Overwrite the on-disk dtype encoding, which is numeric, with
# the dtype attribute stored on disk, which corresponds to
# a timedelta64 dtype.
encoding["dtype"] = attrs.pop("dtype")
dtype = np.dtype(encoding["dtype"])
resolution, _ = np.datetime_data(dtype)
resolution = cast(NPDatetimeUnitOptions, resolution)
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
time_unit = cast(PDDatetimeUnitOptions, "s")
dtype = np.dtype("timedelta64[s]")
message = (
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values for variable {name!r} "
f"have a resolution of {resolution!r}. Attempting to "
f"decode to a resolution of 's'. Note, depending on "
f"the encoded values, this may lead to an "
f"OverflowError. Additionally, data will not be "
f"identically round tripped; xarray will choose an "
f"encoding dtype of 'timedelta64[s]' when re-encoding."
)
emit_user_level_warning(message)
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
time_unit = cast(PDDatetimeUnitOptions, "ns")
dtype = np.dtype("timedelta64[ns]")
message = (
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values for variable {name!r} "
f"have a resolution of {resolution!r}. Attempting to "
f"decode to a resolution of 'ns'. Note, depending on "
f"the encoded values, this may lead to loss of "
f"precision. Additionally, data will not be "
f"identically round tripped; xarray will choose an "
f"encoding dtype of 'timedelta64[ns]' "
f"when re-encoding."
)
emit_user_level_warning(message)
if is_dtype_decodable:
attrs_dtype = attrs.pop("dtype")
if self.time_unit is None:
time_unit = resolve_time_unit_from_attrs_dtype(attrs_dtype, name)
else:
time_unit = cast(PDDatetimeUnitOptions, resolution)
elif self.decode_via_units:
time_unit = self.time_unit
else:
if self._emit_decode_timedelta_future_warning:
emit_user_level_warning(
"In a future version, xarray will not decode "
Expand All @@ -1567,8 +1533,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
"'CFTimedeltaCoder' instance.",
FutureWarning,
)
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
time_unit = self.time_unit
if self.time_unit is None:
time_unit = cast(PDDatetimeUnitOptions, "ns")
else:
time_unit = self.time_unit

# Handle edge case that decode_via_dtype=False and
# decode_via_units=True, and timedeltas were encoded with a
# dtype attribute. We need to remove the dtype attribute
# to prevent an error during round tripping.
if has_timedelta_dtype:
attrs.pop("dtype")

dtype = np.dtype(f"timedelta64[{time_unit}]")
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
data = lazy_elemwise_func(data, transform, dtype=dtype)
return Variable(dims, data, attrs, encoding, fastpath=True)
Expand Down
17 changes: 5 additions & 12 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from xarray.conventions import encode_dataset_coordinates
from xarray.core import indexing
from xarray.core.options import set_options
from xarray.core.types import PDDatetimeUnitOptions
from xarray.core.utils import module_available
from xarray.namedarray.pycompat import array_type
from xarray.tests import (
Expand Down Expand Up @@ -351,16 +352,6 @@ def test_dtype_coercion_error(self) -> None:
with pytest.raises(ValueError, match="could not safely cast"):
ds.to_netcdf(path, format=format)

def test_literal_timedelta_fill_value_coercion_error(self) -> None:
for format in self.netcdf3_formats:
timedeltas = np.array(
[0, np.iinfo(np.int32).min, np.iinfo(np.int64).min]
).astype("timedelta64[s]")
ds = Dataset({"timedeltas": ("timedeltas", timedeltas)})
with create_tmp_file(allow_cleanup_failure=False) as path:
with pytest.raises(ValueError, match="_FillValue"):
ds.to_netcdf(path, format=format)


class DatasetIOBase:
engine: T_NetcdfEngine | None = None
Expand Down Expand Up @@ -652,8 +643,10 @@ def test_roundtrip_timedelta_data(self) -> None:
) as actual:
assert_identical(expected, actual)

def test_roundtrip_literal_timedelta_data(self) -> None:
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
def test_roundtrip_timedelta_data_via_dtype(
self, time_unit: PDDatetimeUnitOptions
) -> None:
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit(time_unit) # type: ignore[arg-type, unused-ignore]
expected = Dataset(
{"td": ("td", time_deltas), "td0": time_deltas[0].to_numpy()}
)
Expand Down
Loading
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy