Skip to content

Allow setting fill_value on Zarr format 3 arrays #10161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 26, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions doc/user-guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,21 @@ reads. Because this fall-back option is so much slower, xarray issues a
instead of falling back to try reading non-consolidated metadata.


Fill Values
~~~~~~~~~~~

Zarr arrays have a ``fill_value`` that is used for chunks that were never written to disk.
For the Zarr version 2 format, Xarray will set ``fill_value`` to be equal to the CF/NetCDF ``"_FillValue"``.
This is ``np.nan`` by default for floats, and unset otherwise. Note that the Zarr library will set a
default ``fill_value`` if not specified (usually ``0``).

For the Zarr version 3 format, ``_FillValue`` and ``fill_value`` are decoupled.
So you can set ``fill_value`` in ``encoding`` as usual.

Note that at read-time, you can control whether ``_FillValue`` is masked using the
``mask_and_scale`` kwarg.


.. _io.kerchunk:

Kerchunk
Expand Down
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ v2025.03.1 (unreleased)
New Features
~~~~~~~~~~~~

- Allow setting a ``fill_value`` for Zarr format 3 arrays. Specify ``fill_value`` in ``encoding`` as usual.
(:issue:`10064`). By `Deepak Cherian <https://github.com/dcherian>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
14 changes: 12 additions & 2 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,7 @@ def extract_zarr_variable_encoding(
raise_on_invalid=False,
name=None,
*,
zarr_format: ZarrFormat,
safe_chunks=True,
region=None,
mode=None,
Expand All @@ -443,6 +444,7 @@ def extract_zarr_variable_encoding(
region: tuple[slice, ...], optional
mode: str, optional
shape: tuple[int, ...], optional
zarr_format: Literal[2,3]
Returns
-------
encoding : dict
Expand All @@ -463,16 +465,23 @@ def extract_zarr_variable_encoding(
"cache_metadata",
"write_empty_chunks",
}
if zarr_format == 3:
valid_encodings.add("fill_value")

for k in safe_to_drop:
if k in encoding:
del encoding[k]

if raise_on_invalid:
invalid = [k for k in encoding if k not in valid_encodings]
if "fill_value" in invalid and zarr_format == 2:
msg = " Use `_FillValue` to set the Zarr array `fill_value`"
else:
msg = ""

if invalid:
raise ValueError(
f"unexpected encoding parameters for zarr backend: {invalid!r}"
f"unexpected encoding parameters for zarr backend: {invalid!r}." + msg
)
else:
for k in list(encoding):
Expand Down Expand Up @@ -1147,7 +1156,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
if self._use_zarr_fill_value_as_mask:
fill_value = attrs.pop("_FillValue", None)
else:
fill_value = None
fill_value = v.encoding.pop("fill_value", None)
if "_FillValue" in attrs:
# replace with encoded fill value
fv = attrs.pop("_FillValue")
Expand Down Expand Up @@ -1198,6 +1207,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
region=region,
mode=self._mode,
shape=zarr_shape,
zarr_format=self.zarr_group.metadata.zarr_format,
)

if self._mode == "w" or name not in existing_keys:
Expand Down
64 changes: 60 additions & 4 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import tempfile
import uuid
import warnings
from collections import ChainMap
from collections.abc import Generator, Iterator, Mapping
from contextlib import ExitStack
from io import BytesIO
Expand Down Expand Up @@ -661,7 +662,7 @@
{"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])}
)

with self.roundtrip(original) as actual:

Check failure on line 665 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

TestZarrWriteEmpty.test_roundtrip_coordinates[2] AttributeError

Check failure on line 665 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

TestZarrWriteEmpty.test_roundtrip_coordinates[2] AttributeError
assert_identical(original, actual)

original["foo"].encoding["coordinates"] = "y"
Expand All @@ -676,7 +677,7 @@
original = Dataset(
{"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])}
)
with self.roundtrip(original) as actual:

Check failure on line 680 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

TestZarrWriteEmpty.test_roundtrip_global_coordinates[2] AttributeError

Check failure on line 680 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

TestZarrWriteEmpty.test_roundtrip_global_coordinates[2] AttributeError
assert_identical(original, actual)

# test that global "coordinates" is as expected
Expand All @@ -693,13 +694,13 @@
original = Dataset(coords={"x": 0, "y z": 1})
expected = Dataset({"y z": 1}, {"x": 0})
with pytest.warns(SerializationWarning):
with self.roundtrip(original) as actual:

Check failure on line 697 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

TestZarrWriteEmpty.test_roundtrip_coordinates_with_space[2] AttributeError

Check failure on line 697 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

TestZarrWriteEmpty.test_roundtrip_coordinates_with_space[2] AttributeError
assert_identical(expected, actual)

def test_roundtrip_boolean_dtype(self) -> None:
original = create_boolean_data()
assert original["x"].dtype == "bool"
with self.roundtrip(original) as actual:

Check failure on line 703 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

TestZarrWriteEmpty.test_roundtrip_boolean_dtype[2] AttributeError

Check failure on line 703 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

TestZarrWriteEmpty.test_roundtrip_boolean_dtype[2] AttributeError
assert_identical(original, actual)
assert actual["x"].dtype == "bool"
# this checks for preserving dtype during second roundtrip
Expand Down Expand Up @@ -3343,6 +3344,61 @@
observed_keys_2 = sorted(zstore_mut.array_keys())
assert observed_keys_2 == sorted(array_keys + [new_key])

@requires_dask
@pytest.mark.parametrize("dtype", [int, float])
def test_zarr_fill_value_setting(self, dtype):
# When zarr_format=2, _FillValue sets fill_value
# When zarr_format=3, fill_value is set independently
# We test this by writing a dask array with compute=False,
# on read we should receive chunks filled with `fill_value`
fv = -1
ds = xr.Dataset(
{"foo": ("x", dask.array.from_array(np.array([0, 0, 0], dtype=dtype)))}
)
expected = xr.Dataset({"foo": ("x", [fv] * 3)})

zarr_format_2 = (
has_zarr_v3 and zarr.config.get("default_zarr_format") == 2
) or not has_zarr_v3
if zarr_format_2:
attr = "_FillValue"
expected.foo.attrs[attr] = fv
else:
attr = "fill_value"
if dtype is float:
# for floats, Xarray inserts a default `np.nan`
expected.foo.attrs["_FillValue"] = np.nan
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this assume that np.nan is serializable to the attributes field using the same JSON encoding that we use for the fill_value field? Because the spec doesn't cover how scalars should be encoded in attributes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I think so:

xarray/xarray/backends/zarr.py

Lines 1150 to 1155 in cdebbf5

fill_value = None
if "_FillValue" in attrs:
# replace with encoded fill value
fv = attrs.pop("_FillValue")
if fv is not None:
attrs["_FillValue"] = FillValueCoder.encode(fv, dtype)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since the zarr spec only defines a special JSON encoding for nan in the specific context of the fill_value field, I would recommend handling the JSON serialization explicitly here.

Also, in zarr-developers/zarr-python#2874 I remove the behavior where all numpy scalars get special serialization to JSON (instead, the dtype object is responsible for that). But it seems like we will break xarray if we move forward with that change.

Copy link
Contributor

@d-v-b d-v-b Mar 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked what zarr v3 does today -- although we apply special JSON encoding to any nan, +infinity, -infinity when writing JSON (e.g., {"foo": np.nan} -> {"foo": "NaN"}), we only apply special decoding when reading the fill value. Special nan values in attributes will not get special decoding. Is xarray basically handling that special decoding here? If so, we don't have anything to worry about w.r.t. the new dtype stuff.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes looks like Ryan added base64 encode/decode.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be great if we could take that custom attribute encoding/decoding stuff out of Xarray and replace it with @d-v-b's new ZarrDType machinery.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have stand-alone functions that handle this, but they are currently not public API. Would be happy to make them public though!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

("this" meaning "convert special floats to / from JSON")


open_kwargs = {"mask_and_scale": False, "consolidated": False}
save_kwargs = dict(compute=False, consolidated=False)
with self.roundtrip(
ds,
save_kwargs=ChainMap(save_kwargs, dict(encoding={"foo": {attr: fv}})),
open_kwargs=open_kwargs,
) as actual:
assert_identical(actual, expected)

ds.foo.encoding[attr] = fv
with self.roundtrip(
ds, save_kwargs=save_kwargs, open_kwargs=open_kwargs
) as actual:
assert_identical(actual, expected)

if zarr_format_2:
ds = ds.drop_encoding()
with pytest.raises(ValueError, match="_FillValue"):
with self.roundtrip(
ds,
save_kwargs=ChainMap(
save_kwargs, dict(encoding={"foo": {"fill_value": fv}})
),
open_kwargs=open_kwargs,
):
pass
# TODO: this doesn't fail because of the
# ``raise_on_invalid=vn in check_encoding_set`` line in zarr.py
# ds.foo.encoding["fill_value"] = fv


@requires_zarr
@pytest.mark.skipif(
Expand Down Expand Up @@ -3720,7 +3776,7 @@
)

with self.create_zarr_target() as store, patched as mock:
ds.to_zarr(store, mode="w")

Check failure on line 3779 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrWriteEmpty.test_avoid_excess_metadata_calls[2] AttributeError

# We expect this to request array metadata information, so call_count should be == 1,
xrds = xr.open_zarr(store)
Expand Down Expand Up @@ -5827,23 +5883,23 @@
@requires_zarr
def test_extract_zarr_variable_encoding() -> None:
var = xr.Variable("x", [1, 2])
actual = backends.zarr.extract_zarr_variable_encoding(var)
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)
assert "chunks" in actual
assert actual["chunks"] == ("auto" if has_zarr_v3 else None)

var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)})
actual = backends.zarr.extract_zarr_variable_encoding(var)
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)
assert actual["chunks"] == (1,)

# does not raise on invalid
var = xr.Variable("x", [1, 2], encoding={"foo": (1,)})
actual = backends.zarr.extract_zarr_variable_encoding(var)
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)

# raises on invalid
var = xr.Variable("x", [1, 2], encoding={"foo": (1,)})
with pytest.raises(ValueError, match=r"unexpected encoding parameters"):
actual = backends.zarr.extract_zarr_variable_encoding(
var, raise_on_invalid=True
var, raise_on_invalid=True, zarr_format=3
)


Expand All @@ -5862,7 +5918,7 @@

m = fsspec.filesystem("memory")
mm = m.get_mapper("out1.zarr")
ds.to_zarr(mm) # old interface

Check failure on line 5921 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_fsspec AttributeError

Check failure on line 5921 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_fsspec AttributeError
ds0 = ds.copy()
# pd.to_timedelta returns ns-precision, but the example data is in second precision
# so we need to fix this
Expand Down Expand Up @@ -5934,7 +5990,7 @@
}
)
ds["test"].encoding["chunks"] = encoded_chunks
ds.to_zarr(tmp_path / "test.zarr")

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_dataset_chunking_zarr[auto] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_dataset_chunking_zarr[-1] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_dataset_chunking_zarr[chunks2] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_dataset_chunking_zarr[chunks3] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

test_open_dataset_chunking_zarr[chunks4] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_dataset_chunking_zarr[auto] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_dataset_chunking_zarr[-1] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_dataset_chunking_zarr[chunks2] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_dataset_chunking_zarr[chunks3] AttributeError

Check failure on line 5993 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

test_open_dataset_chunking_zarr[chunks4] AttributeError

with dask.config.set({"array.chunk-size": "1MiB"}):
expected = ds.chunk(chunks)
Expand Down Expand Up @@ -6148,7 +6204,7 @@
def test_zarr_closing_internal_zip_store():
store_name = "tmp.zarr.zip"
original_da = DataArray(np.arange(12).reshape((3, 4)))
original_da.to_zarr(store_name, mode="w")

Check failure on line 6207 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

test_zarr_closing_internal_zip_store[2] AttributeError

with open_dataarray(store_name, engine="zarr") as loaded_da:
assert_identical(original_da, loaded_da)
Expand Down Expand Up @@ -6199,7 +6255,7 @@
],
)
def test_zarr_region_auto(self, region):
with self.create() as (target, ds):

Check failure on line 6258 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_auto[2-full-auto] AttributeError

Check failure on line 6258 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_auto[2-mixed-auto] AttributeError
ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8))
self.save(target, ds_region, region=region)
ds_updated = xr.open_zarr(target)
Expand All @@ -6209,7 +6265,7 @@
assert_identical(ds_updated, expected)

def test_zarr_region_auto_noncontiguous(self):
with self.create() as (target, ds):

Check failure on line 6268 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_auto_noncontiguous[2] AttributeError
with pytest.raises(ValueError):
self.save(target, ds.isel(x=[0, 2, 3], y=[5, 6]), region="auto")

Expand All @@ -6222,7 +6278,7 @@
region: Mapping[str, slice] | Literal["auto"]
region_slice = dict(x=slice(2, 4), y=slice(6, 8))

with self.create() as (target, ds):

Check failure on line 6281 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_index_write[2] AttributeError
ds_region = 1 + ds.isel(region_slice)
for region in [region_slice, "auto"]: # type: ignore[assignment]
with patch.object(
Expand All @@ -6241,7 +6297,7 @@
assert "y" not in written_variables

def test_zarr_region_append(self):
with self.create() as (target, ds):

Check failure on line 6300 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_append[2] AttributeError
x_new = np.arange(40, 70, 10)
data_new = np.ones((3, 10))
ds_new = xr.Dataset(
Expand All @@ -6261,7 +6317,7 @@
self.save(target, ds_new, mode="a", append_dim="x", region="auto")

def test_zarr_region(self):
with self.create() as (target, ds):

Check failure on line 6320 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region[2] AttributeError
ds_transposed = ds.transpose("y", "x")
ds_region = 1 + ds_transposed.isel(x=[0], y=[0])
self.save(target, ds_region, region={"x": slice(0, 1), "y": slice(0, 1)})
Expand All @@ -6281,7 +6337,7 @@
)

with self.create_zarr_target() as target:
self.save(target, ds.chunk(5), compute=False, mode="w")

Check failure on line 6340 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_region_chunk_partial[2] AttributeError
with pytest.raises(ValueError):
for r in range(ds.sizes["a"]):
self.save(
Expand All @@ -6308,7 +6364,7 @@
)

with self.create_zarr_target() as target:
self.save(target, da, mode="w", encoding={"foo": {"chunks": (5, 5, 1)}})

Check failure on line 6367 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

TestZarrRegionAuto.test_zarr_append_chunk_partial[2] AttributeError

with pytest.raises(ValueError, match="encoding was provided"):
self.save(
Expand Down
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy