From 9503c71c0d49a6e9f103dd38806596c274822c98 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 4 Dec 2024 21:34:29 +0100 Subject: [PATCH 01/38] cache array keys on ZarrStore instances --- xarray/backends/zarr.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index cb3ab375c31..8f6e24965d8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -612,6 +612,8 @@ class ZarrStore(AbstractWritableDataStore): "_use_zarr_fill_value_as_mask", "_write_empty", "_write_region", + "_cache_array_keys", + "_cache", "zarr_group", ) @@ -633,6 +635,7 @@ def open_store( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, + cache_array_keys: bool = True, ): ( zarr_group, @@ -664,6 +667,7 @@ def open_store( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, + cache_array_keys=cache_array_keys, ) for group in group_paths } @@ -686,6 +690,7 @@ def open_group( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, + cache_array_keys: bool = True, ): ( zarr_group, @@ -716,6 +721,7 @@ def open_group( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, + cache_array_keys ) def __init__( @@ -729,6 +735,7 @@ def __init__( write_empty: bool | None = None, close_store_on_close: bool = False, use_zarr_fill_value_as_mask=None, + cache_array_keys: bool = True ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -743,6 +750,10 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask + self._cache = {} + if cache_array_keys: + self._cache['array_keys'] = None + @property def ds(self): # TODO: consider deprecating this in favor of zarr_group @@ -802,6 +813,17 @@ def get_variables(self): (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays() ) + def get_array_keys(self) -> tuple[str, ...]: + _key = 'array_keys' + if _key not in self._cache: + result = self.zarr_group.array_keys() + elif self._cache[_key] is None: + result = self.zarr_group.array_keys() + self._cache[_key] = result + else: + result = self._cache[_key] + return tuple(result) + def get_attrs(self): return { k: v @@ -881,7 +903,7 @@ def store( existing_keys = {} existing_variable_names = {} else: - existing_keys = tuple(self.zarr_group.array_keys()) + existing_keys = self.get_array_keys() existing_variable_names = { vn for vn in variables if _encode_variable_name(vn) in existing_keys } @@ -1059,7 +1081,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dimensions. """ - existing_keys = tuple(self.zarr_group.array_keys()) + existing_keys = self.get_array_keys() is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3 for vn, v in variables.items(): @@ -1251,7 +1273,7 @@ def _validate_and_autodetect_region(self, ds: Dataset) -> Dataset: def _validate_encoding(self, encoding) -> None: if encoding and self._mode in ["a", "a-", "r+"]: - existing_var_names = set(self.zarr_group.array_keys()) + existing_var_names = self.get_array_keys() for var_name in existing_var_names: if var_name in encoding: raise ValueError( From 94b48ac2eb1b7272fd0cb0be7c5ad65cfa074c52 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 14:18:46 +0100 Subject: [PATCH 02/38] refactor get_array_keys --- xarray/backends/zarr.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8f6e24965d8..3125fc79686 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -814,15 +814,15 @@ def get_variables(self): ) def get_array_keys(self) -> tuple[str, ...]: - _key = 'array_keys' - if _key not in self._cache: - result = self.zarr_group.array_keys() - elif self._cache[_key] is None: - result = self.zarr_group.array_keys() - self._cache[_key] = result + key = 'array_keys' + if key not in self._cache: + result = tuple(self.zarr_group.array_keys()) + elif self._cache[key] is None: + result = tuple(self.zarr_group.array_keys()) + self._cache[key] = result else: - result = self._cache[_key] - return tuple(result) + result = self._cache[key] + return result def get_attrs(self): return { @@ -1303,6 +1303,7 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, + cache_array_keys: bool = False, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -1456,6 +1457,7 @@ def open_zarr( "storage_options": storage_options, "zarr_version": zarr_version, "zarr_format": zarr_format, + "cache_array_keys": cache_array_keys, } ds = open_dataset( @@ -1527,6 +1529,7 @@ def open_dataset( store=None, engine=None, use_zarr_fill_value_as_mask=None, + cache_array_keys: bool = True ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1542,6 +1545,7 @@ def open_dataset( zarr_version=zarr_version, use_zarr_fill_value_as_mask=None, zarr_format=zarr_format, + cache_array_keys=cache_array_keys, ) store_entrypoint = StoreBackendEntrypoint() From 2dacb2acacbc6f10579767d0d8ab969798bb6292 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 14:19:00 +0100 Subject: [PATCH 03/38] add test for get_array_keys --- xarray/tests/test_backends.py | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ff254225321..b178b56a47d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -48,6 +48,7 @@ ) from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint +from xarray.backends.zarr import ZarrStore from xarray.coding.cftime_offsets import cftime_range from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype from xarray.coding.variables import SerializationWarning @@ -3261,6 +3262,41 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks + @pytest.mark.parametrize('cache_array_keys', [True, False]) + def test_get_array_keys(self, cache_array_keys: bool) -> None: + """ + Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, + a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the + `ZarrStore.zarr_group` instance once, and that the results of that call are cached. + + Otherwise, `ZarrStore.get_array_keys` instance should invoke the `array_keys` + each time it is called. + """ + with self.create_zarr_target() as store_target: + zstore = backends.ZarrStore.open_group( + store_target, + mode='w', + cache_array_keys=cache_array_keys) + + # ensure that the keys are sorted + array_keys = sorted(('foo', 'bar')) + + # create some arrays + for ak in array_keys: + zstore.zarr_group.create(name=ak, shape=(1,), dtype='uint8') + + observed_keys_0 = sorted(zstore.get_array_keys()) + assert observed_keys_0 == array_keys + + # create a new array + new_key = 'baz' + zstore.zarr_group.create(name=new_key, shape=(1,), dtype='uint8') + observed_keys_1 = sorted(zstore.get_array_keys()) + + if cache_array_keys: + assert observed_keys_1 == array_keys + else: + assert observed_keys_1 == sorted(array_keys + [new_key]) @requires_zarr @pytest.mark.skipif( @@ -6526,3 +6562,4 @@ def test_h5netcdf_storage_options() -> None: storage_options={"skip_instance_cache": False}, ) assert_identical(xr.concat([ds1, ds2], dim="time"), ds) + From c7cfacf531a94d85acf8757e57e4ff0a11addb0a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:45:09 +0000 Subject: [PATCH 04/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 14 +++++++------- xarray/tests/test_backends.py | 22 ++++++++++------------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3125fc79686..7072fe1ceb4 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -602,6 +602,8 @@ class ZarrStore(AbstractWritableDataStore): __slots__ = ( "_append_dim", + "_cache", + "_cache_array_keys", "_close_store_on_close", "_consolidate_on_close", "_group", @@ -612,8 +614,6 @@ class ZarrStore(AbstractWritableDataStore): "_use_zarr_fill_value_as_mask", "_write_empty", "_write_region", - "_cache_array_keys", - "_cache", "zarr_group", ) @@ -721,7 +721,7 @@ def open_group( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, - cache_array_keys + cache_array_keys, ) def __init__( @@ -735,7 +735,7 @@ def __init__( write_empty: bool | None = None, close_store_on_close: bool = False, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True + cache_array_keys: bool = True, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -752,7 +752,7 @@ def __init__( self._cache = {} if cache_array_keys: - self._cache['array_keys'] = None + self._cache["array_keys"] = None @property def ds(self): @@ -814,7 +814,7 @@ def get_variables(self): ) def get_array_keys(self) -> tuple[str, ...]: - key = 'array_keys' + key = "array_keys" if key not in self._cache: result = tuple(self.zarr_group.array_keys()) elif self._cache[key] is None: @@ -1529,7 +1529,7 @@ def open_dataset( store=None, engine=None, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True + cache_array_keys: bool = True, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b178b56a47d..af9ad58346e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3262,11 +3262,11 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks - @pytest.mark.parametrize('cache_array_keys', [True, False]) + @pytest.mark.parametrize("cache_array_keys", [True, False]) def test_get_array_keys(self, cache_array_keys: bool) -> None: """ - Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, - a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the + Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, + a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the `ZarrStore.zarr_group` instance once, and that the results of that call are cached. Otherwise, `ZarrStore.get_array_keys` instance should invoke the `array_keys` @@ -3274,23 +3274,22 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: """ with self.create_zarr_target() as store_target: zstore = backends.ZarrStore.open_group( - store_target, - mode='w', - cache_array_keys=cache_array_keys) + store_target, mode="w", cache_array_keys=cache_array_keys + ) # ensure that the keys are sorted - array_keys = sorted(('foo', 'bar')) + array_keys = sorted(("foo", "bar")) # create some arrays for ak in array_keys: - zstore.zarr_group.create(name=ak, shape=(1,), dtype='uint8') + zstore.zarr_group.create(name=ak, shape=(1,), dtype="uint8") observed_keys_0 = sorted(zstore.get_array_keys()) assert observed_keys_0 == array_keys # create a new array - new_key = 'baz' - zstore.zarr_group.create(name=new_key, shape=(1,), dtype='uint8') + new_key = "baz" + zstore.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") observed_keys_1 = sorted(zstore.get_array_keys()) if cache_array_keys: @@ -3298,6 +3297,7 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: else: assert observed_keys_1 == sorted(array_keys + [new_key]) + @requires_zarr @pytest.mark.skipif( KVStore is None, reason="zarr-python 2.x or ZARR_V3_EXPERIMENTAL_API is unset." @@ -6194,7 +6194,6 @@ def test_zarr_region_auto_new_coord_vals(self, tmp_path): ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) def test_zarr_region_index_write(self, tmp_path): - from xarray.backends.zarr import ZarrStore x = np.arange(0, 50, 10) y = np.arange(0, 20, 2) @@ -6562,4 +6561,3 @@ def test_h5netcdf_storage_options() -> None: storage_options={"skip_instance_cache": False}, ) assert_identical(xr.concat([ds1, ds2], dim="time"), ds) - From b3cf2e5e44c36a16fd6e456cd287f473b5e308d5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 15:45:49 +0100 Subject: [PATCH 05/38] lint --- xarray/backends/zarr.py | 14 +++++++------- xarray/tests/test_backends.py | 23 ++++++++++------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3125fc79686..7072fe1ceb4 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -602,6 +602,8 @@ class ZarrStore(AbstractWritableDataStore): __slots__ = ( "_append_dim", + "_cache", + "_cache_array_keys", "_close_store_on_close", "_consolidate_on_close", "_group", @@ -612,8 +614,6 @@ class ZarrStore(AbstractWritableDataStore): "_use_zarr_fill_value_as_mask", "_write_empty", "_write_region", - "_cache_array_keys", - "_cache", "zarr_group", ) @@ -721,7 +721,7 @@ def open_group( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, - cache_array_keys + cache_array_keys, ) def __init__( @@ -735,7 +735,7 @@ def __init__( write_empty: bool | None = None, close_store_on_close: bool = False, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True + cache_array_keys: bool = True, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -752,7 +752,7 @@ def __init__( self._cache = {} if cache_array_keys: - self._cache['array_keys'] = None + self._cache["array_keys"] = None @property def ds(self): @@ -814,7 +814,7 @@ def get_variables(self): ) def get_array_keys(self) -> tuple[str, ...]: - key = 'array_keys' + key = "array_keys" if key not in self._cache: result = tuple(self.zarr_group.array_keys()) elif self._cache[key] is None: @@ -1529,7 +1529,7 @@ def open_dataset( store=None, engine=None, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True + cache_array_keys: bool = True, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b178b56a47d..46ab2bbe8be 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3262,11 +3262,11 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks - @pytest.mark.parametrize('cache_array_keys', [True, False]) + @pytest.mark.parametrize("cache_array_keys", [True, False]) def test_get_array_keys(self, cache_array_keys: bool) -> None: """ - Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, - a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the + Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, + a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the `ZarrStore.zarr_group` instance once, and that the results of that call are cached. Otherwise, `ZarrStore.get_array_keys` instance should invoke the `array_keys` @@ -3274,23 +3274,22 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: """ with self.create_zarr_target() as store_target: zstore = backends.ZarrStore.open_group( - store_target, - mode='w', - cache_array_keys=cache_array_keys) + store_target, mode="w", cache_array_keys=cache_array_keys + ) # ensure that the keys are sorted - array_keys = sorted(('foo', 'bar')) + array_keys = sorted(("foo", "bar")) # create some arrays for ak in array_keys: - zstore.zarr_group.create(name=ak, shape=(1,), dtype='uint8') + zstore.zarr_group.create(name=ak, shape=(1,), dtype="uint8") observed_keys_0 = sorted(zstore.get_array_keys()) assert observed_keys_0 == array_keys # create a new array - new_key = 'baz' - zstore.zarr_group.create(name=new_key, shape=(1,), dtype='uint8') + new_key = "baz" + zstore.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") observed_keys_1 = sorted(zstore.get_array_keys()) if cache_array_keys: @@ -3298,6 +3297,7 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: else: assert observed_keys_1 == sorted(array_keys + [new_key]) + @requires_zarr @pytest.mark.skipif( KVStore is None, reason="zarr-python 2.x or ZARR_V3_EXPERIMENTAL_API is unset." @@ -6194,8 +6194,6 @@ def test_zarr_region_auto_new_coord_vals(self, tmp_path): ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) def test_zarr_region_index_write(self, tmp_path): - from xarray.backends.zarr import ZarrStore - x = np.arange(0, 50, 10) y = np.arange(0, 20, 2) data = np.ones((5, 10)) @@ -6562,4 +6560,3 @@ def test_h5netcdf_storage_options() -> None: storage_options={"skip_instance_cache": False}, ) assert_identical(xr.concat([ds1, ds2], dim="time"), ds) - From bec8b4238bc73bcf37ca091eb241023680ddf776 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 15:58:40 +0100 Subject: [PATCH 06/38] add type annotation for _cache --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7072fe1ceb4..8b83f2b755d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -750,7 +750,7 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask - self._cache = {} + self._cache: dict[str, tuple[str, ...]] = {} if cache_array_keys: self._cache["array_keys"] = None From 61fe759778f7ca3b9425ca3fc1d54221466f69ab Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 16:02:23 +0100 Subject: [PATCH 07/38] make type hint accurate --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8b83f2b755d..b3d2d3af413 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -750,7 +750,7 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask - self._cache: dict[str, tuple[str, ...]] = {} + self._cache: dict[str, None | tuple[str, ...]] = {} if cache_array_keys: self._cache["array_keys"] = None From 0952c93208269de65ee6ac9a02fae39a6783863f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 16:16:02 +0100 Subject: [PATCH 08/38] make type hint pass mypy --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b3d2d3af413..8cb2d808dd6 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -750,7 +750,7 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask - self._cache: dict[str, None | tuple[str, ...]] = {} + self._cache: dict[str,] = {} if cache_array_keys: self._cache["array_keys"] = None From 96ddcf4230df78905a026f1974a46892471dbd44 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 6 Dec 2024 16:17:49 +0100 Subject: [PATCH 09/38] make type hint pass mypy, correctly --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8cb2d808dd6..03e96985251 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -750,7 +750,7 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask - self._cache: dict[str,] = {} + self._cache: dict[str, Any] = {} if cache_array_keys: self._cache["array_keys"] = None From 60761a89fc7f63b57e629231ef0698d2828e9a23 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 9 Dec 2024 21:19:01 +0100 Subject: [PATCH 10/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 03e96985251..aa6a8349fa5 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -635,7 +635,7 @@ def open_store( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, - cache_array_keys: bool = True, + cache_array_keys: bool = False, ): ( zarr_group, From 107c4e72b20d1286cdebeaa7f29d2ebb996038c3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 10:15:44 +0100 Subject: [PATCH 11/38] use roundtrip method instead of explicit alternative --- xarray/tests/test_backends.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 46ab2bbe8be..469bc710337 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -409,12 +409,8 @@ def test_zero_dimensional_variable(self) -> None: def test_write_store(self) -> None: expected = create_test_data() - with self.create_store() as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) + with self.roundtrip(expected) as actual: + assert_identical(actual, expected) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: From 2d3c13a82d31952b32cf71c7120ec490af28816d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 10:24:23 +0100 Subject: [PATCH 12/38] cache members instead of just array keys --- xarray/backends/zarr.py | 74 ++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 03e96985251..3124cf62769 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -9,6 +9,8 @@ import numpy as np import pandas as pd +from zarr import Array as ZarrArray +from zarr import Group as ZarrGroup from xarray import coding, conventions from xarray.backends.common import ( @@ -38,9 +40,6 @@ from xarray.namedarray.utils import module_available if TYPE_CHECKING: - from zarr import Array as ZarrArray - from zarr import Group as ZarrGroup - from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree @@ -602,11 +601,11 @@ class ZarrStore(AbstractWritableDataStore): __slots__ = ( "_append_dim", - "_cache", "_cache_array_keys", "_close_store_on_close", "_consolidate_on_close", "_group", + "_members", "_mode", "_read_only", "_safe_chunks", @@ -690,7 +689,7 @@ def open_group( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, - cache_array_keys: bool = True, + cache_members: bool = True, ): ( zarr_group, @@ -721,7 +720,7 @@ def open_group( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, - cache_array_keys, + cache_members, ) def __init__( @@ -735,7 +734,7 @@ def __init__( write_empty: bool | None = None, close_store_on_close: bool = False, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True, + cache_members: bool = True, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -750,9 +749,37 @@ def __init__( self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask - self._cache: dict[str, Any] = {} - if cache_array_keys: - self._cache["array_keys"] = None + self._members: tuple[bool, dict[str, ZarrArray | ZarrGroup] | None] + if cache_members: + self._members = (True, None) + else: + self._members = (False, None) + + @property + def members(self) -> dict[str, ZarrArray | ZarrGroup]: + """ + Model the arrays and groups contained in self.zarr_group as a dict + """ + do_cache, old_members = self._members + if not do_cache or old_members is None: + # we explicitly only care about the arrays, which saves some IO + # in zarr v2 + members = dict(self.zarr_group.arrays()) + if do_cache: + self._members = (do_cache, members) + return members + else: + return old_members + + def array_keys(self) -> tuple[str, ...]: + return tuple(key for (key, _) in self.arrays()) + + def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: + return tuple( + (key, node) + for (key, node) in self.members.items() + if isinstance(node, ZarrArray) + ) @property def ds(self): @@ -761,7 +788,7 @@ def ds(self): def open_store_variable(self, name, zarr_array=None): if zarr_array is None: - zarr_array = self.zarr_group[name] + zarr_array = self.members[name] data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( @@ -809,20 +836,7 @@ def open_store_variable(self, name, zarr_array=None): return Variable(dimensions, data, attributes, encoding) def get_variables(self): - return FrozenDict( - (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays() - ) - - def get_array_keys(self) -> tuple[str, ...]: - key = "array_keys" - if key not in self._cache: - result = tuple(self.zarr_group.array_keys()) - elif self._cache[key] is None: - result = tuple(self.zarr_group.array_keys()) - self._cache[key] = result - else: - result = self._cache[key] - return result + return FrozenDict((k, self.open_store_variable(k, v)) for k, v in self.arrays()) def get_attrs(self): return { @@ -834,7 +848,7 @@ def get_attrs(self): def get_dimensions(self): try_nczarr = self._mode == "r" dimensions = {} - for _k, v in self.zarr_group.arrays(): + for _k, v in self.arrays(): dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr) for d, s in zip(dim_names, v.shape, strict=True): if d in dimensions and dimensions[d] != s: @@ -903,7 +917,7 @@ def store( existing_keys = {} existing_variable_names = {} else: - existing_keys = self.get_array_keys() + existing_keys = self.array_keys() existing_variable_names = { vn for vn in variables if _encode_variable_name(vn) in existing_keys } @@ -1081,7 +1095,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dimensions. """ - existing_keys = self.get_array_keys() + existing_keys = self.array_keys() is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3 for vn, v in variables.items(): @@ -1273,7 +1287,7 @@ def _validate_and_autodetect_region(self, ds: Dataset) -> Dataset: def _validate_encoding(self, encoding) -> None: if encoding and self._mode in ["a", "a-", "r+"]: - existing_var_names = self.get_array_keys() + existing_var_names = self.array_keys() for var_name in existing_var_names: if var_name in encoding: raise ValueError( @@ -1545,7 +1559,7 @@ def open_dataset( zarr_version=zarr_version, use_zarr_fill_value_as_mask=None, zarr_format=zarr_format, - cache_array_keys=cache_array_keys, + cache_members=cache_array_keys, ) store_entrypoint = StoreBackendEntrypoint() From be5e38972552397c214c8d98301dff857c7faee0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 10:26:10 +0100 Subject: [PATCH 13/38] adjust test to members caching --- xarray/tests/test_backends.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 46ab2bbe8be..f05a53ed2da 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3274,7 +3274,7 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: """ with self.create_zarr_target() as store_target: zstore = backends.ZarrStore.open_group( - store_target, mode="w", cache_array_keys=cache_array_keys + store_target, mode="w", cache_members=cache_array_keys ) # ensure that the keys are sorted @@ -3284,13 +3284,13 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: for ak in array_keys: zstore.zarr_group.create(name=ak, shape=(1,), dtype="uint8") - observed_keys_0 = sorted(zstore.get_array_keys()) + observed_keys_0 = sorted(zstore.array_keys()) assert observed_keys_0 == array_keys # create a new array new_key = "baz" zstore.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") - observed_keys_1 = sorted(zstore.get_array_keys()) + observed_keys_1 = sorted(zstore.array_keys()) if cache_array_keys: assert observed_keys_1 == array_keys @@ -3375,11 +3375,11 @@ def test_append(self) -> None: } else: expected = { - "iter": 3, + "iter": 2, "contains": 18, "setitem": 10, "getitem": 13, - "listdir": 2, + "listdir": 1, "list_prefix": 2, } @@ -3477,11 +3477,11 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 2, + "iter": 1, "contains": 4, "setitem": 1, - "getitem": 4, - "listdir": 2, + "getitem": 5, + "listdir": 1, "list_prefix": 0, } From 9d147b90fced2c24568ffaeeadf1b9224df03627 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 12:29:22 +0100 Subject: [PATCH 14/38] refactor members cache --- xarray/backends/zarr.py | 59 +++++++++++++++++++++++------------ xarray/tests/test_backends.py | 50 ++++++++++++++++------------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 231834ec1ba..c06bdecebb6 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -601,7 +601,7 @@ class ZarrStore(AbstractWritableDataStore): __slots__ = ( "_append_dim", - "_cache_array_keys", + "_cache_members", "_close_store_on_close", "_consolidate_on_close", "_group", @@ -634,7 +634,7 @@ def open_store( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, - cache_array_keys: bool = False, + cache_members: bool = False, ): ( zarr_group, @@ -666,7 +666,7 @@ def open_store( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, - cache_array_keys=cache_array_keys, + cache_members=cache_members ) for group in group_paths } @@ -748,31 +748,42 @@ def __init__( self._write_empty = write_empty self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask + self._cache_members: bool = cache_members + self._members: dict[str, ZarrArray | ZarrGroup] = {} - self._members: tuple[bool, dict[str, ZarrArray | ZarrGroup] | None] - if cache_members: - self._members = (True, None) - else: - self._members = (False, None) + if self._cache_members: + # initialize the cache + self._members = self._fetch_members() @property - def members(self) -> dict[str, ZarrArray | ZarrGroup]: + def members(self) -> dict[str, ZarrArray]: """ Model the arrays and groups contained in self.zarr_group as a dict """ - do_cache, old_members = self._members - if not do_cache or old_members is None: - # we explicitly only care about the arrays, which saves some IO - # in zarr v2 - members = dict(self.zarr_group.arrays()) - if do_cache: - self._members = (do_cache, members) - return members + if not self._cache_members: + return self._fetch_members() else: - return old_members + return self._members + + def _fetch_members(self) -> dict[str, ZarrArray]: + """ + Get the arrays and groups defined in the zarr group modelled by this Store + """ + return dict(self.zarr_group.items()) + + def _update_members(self, data: dict[str, ZarrArray]): + if not self._cache_members: + msg = ( + 'Updating the members cache is only valid if this object was created ' + 'with cache_members=True, but this object has `cache_members=False`.' + f'You should update the zarr group directly.' + ) + raise ValueError(msg) + else: + self._members = {**self.members, **data} def array_keys(self) -> tuple[str, ...]: - return tuple(key for (key, _) in self.arrays()) + return tuple(key for (key, node) in self.members.items() if isinstance(node, ZarrArray)) def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: return tuple( @@ -1047,6 +1058,10 @@ def _open_existing_array(self, *, name) -> ZarrArray: else: zarr_array = self.zarr_group[name] + # update the model of the underlying zarr group + if self._cache_members: + self._update_members({name: zarr_array}) + self._update_members({name: zarr_array}) return zarr_array def _create_new_array( @@ -1075,6 +1090,9 @@ def _create_new_array( **encoding, ) zarr_array = _put_attrs(zarr_array, attrs) + # update the model of the underlying zarr group + if self._cache_members: + self._update_members({name: zarr_array}) return zarr_array def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None): @@ -1143,7 +1161,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No zarr_array.resize(new_shape) zarr_shape = zarr_array.shape - + # update the model of the members of the zarr group + self.members[name] = zarr_array region = tuple(write_region[dim] for dim in dims) # We need to do this for both new and existing variables to ensure we're not diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6e5d7170553..14756c19bb0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2275,10 +2275,10 @@ def create_zarr_target(self): raise NotImplementedError @contextlib.contextmanager - def create_store(self): + def create_store(self, cache_members: bool = True): with self.create_zarr_target() as store_target: yield backends.ZarrStore.open_group( - store_target, mode="w", **self.version_kwargs + store_target, mode="w", cache_members=cache_members, **self.version_kwargs ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] @@ -2572,7 +2572,7 @@ def test_hidden_zarr_keys(self) -> None: skip_if_zarr_format_3("This test is unnecessary; no hidden Zarr keys") expected = create_test_data() - with self.create_store() as store: + with self.create_store(cache_members=False) as store: expected.dump_to_store(store) zarr_group = store.ds @@ -2594,6 +2594,7 @@ def test_hidden_zarr_keys(self) -> None: # put it back and try removing from a variable del zarr_group["var2"].attrs[self.DIMENSION_KEY] + with pytest.raises(KeyError): with xr.decode_cf(store): pass @@ -3258,19 +3259,18 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks - @pytest.mark.parametrize("cache_array_keys", [True, False]) - def test_get_array_keys(self, cache_array_keys: bool) -> None: + def test_cache_members(self) -> None: """ - Ensure that if `ZarrStore` is created with `cache_array_keys` set to `True`, - a `ZarrStore.get_array_keys` only invokes the `array_keys` function on the - `ZarrStore.zarr_group` instance once, and that the results of that call are cached. + Ensure that if `ZarrStore` is created with `cache_members` set to `True`, + a `ZarrStore` only inspects the underlying zarr group once, + and that the results of that inspection are cached. - Otherwise, `ZarrStore.get_array_keys` instance should invoke the `array_keys` - each time it is called. + Otherwise, `ZarrStore.members` should inspect the underlying zarr group each time it is + invoked """ with self.create_zarr_target() as store_target: - zstore = backends.ZarrStore.open_group( - store_target, mode="w", cache_members=cache_array_keys + zstore_mut = backends.ZarrStore.open_group( + store_target, mode="w", cache_members=False ) # ensure that the keys are sorted @@ -3278,20 +3278,26 @@ def test_get_array_keys(self, cache_array_keys: bool) -> None: # create some arrays for ak in array_keys: - zstore.zarr_group.create(name=ak, shape=(1,), dtype="uint8") + zstore_mut.zarr_group.create(name=ak, shape=(1,), dtype="uint8") + + zstore_stat = backends.ZarrStore.open_group( + store_target, mode="r", cache_members=True + ) - observed_keys_0 = sorted(zstore.array_keys()) + observed_keys_0 = sorted(zstore_stat.array_keys()) assert observed_keys_0 == array_keys # create a new array new_key = "baz" - zstore.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") - observed_keys_1 = sorted(zstore.array_keys()) + zstore_mut.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") + + observed_keys_1 = sorted(zstore_stat.array_keys()) + assert observed_keys_1 == array_keys + + observed_keys_2 = sorted(zstore_mut.array_keys()) + assert observed_keys_2 == sorted(array_keys + [new_key]) + - if cache_array_keys: - assert observed_keys_1 == array_keys - else: - assert observed_keys_1 == sorted(array_keys + [new_key]) @requires_zarr @@ -3556,9 +3562,9 @@ def create_zarr_target(self): yield tmp @contextlib.contextmanager - def create_store(self): + def create_store(self, cache_members: bool = True): with self.create_zarr_target() as store_target: - group = backends.ZarrStore.open_group(store_target, mode="a") + group = backends.ZarrStore.open_group(store_target, mode="a", cache_members=cache_members) yield group From bdb20a6d9a63e4d4bdbe9dff512c16d22197a95f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:58:33 +0000 Subject: [PATCH 15/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 14 ++++++++------ xarray/tests/test_backends.py | 21 ++++++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c06bdecebb6..56403d76114 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -666,7 +666,7 @@ def open_store( write_empty, close_store_on_close, use_zarr_fill_value_as_mask, - cache_members=cache_members + cache_members=cache_members, ) for group in group_paths } @@ -774,16 +774,18 @@ def _fetch_members(self) -> dict[str, ZarrArray]: def _update_members(self, data: dict[str, ZarrArray]): if not self._cache_members: msg = ( - 'Updating the members cache is only valid if this object was created ' - 'with cache_members=True, but this object has `cache_members=False`.' - f'You should update the zarr group directly.' - ) + "Updating the members cache is only valid if this object was created " + "with cache_members=True, but this object has `cache_members=False`." + "You should update the zarr group directly." + ) raise ValueError(msg) else: self._members = {**self.members, **data} def array_keys(self) -> tuple[str, ...]: - return tuple(key for (key, node) in self.members.items() if isinstance(node, ZarrArray)) + return tuple( + key for (key, node) in self.members.items() if isinstance(node, ZarrArray) + ) def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: return tuple( diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 14756c19bb0..b638aa70905 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2278,7 +2278,10 @@ def create_zarr_target(self): def create_store(self, cache_members: bool = True): with self.create_zarr_target() as store_target: yield backends.ZarrStore.open_group( - store_target, mode="w", cache_members=cache_members, **self.version_kwargs + store_target, + mode="w", + cache_members=cache_members, + **self.version_kwargs, ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] @@ -2594,7 +2597,7 @@ def test_hidden_zarr_keys(self) -> None: # put it back and try removing from a variable del zarr_group["var2"].attrs[self.DIMENSION_KEY] - + with pytest.raises(KeyError): with xr.decode_cf(store): pass @@ -3262,10 +3265,10 @@ def test_chunked_cftime_datetime(self) -> None: def test_cache_members(self) -> None: """ Ensure that if `ZarrStore` is created with `cache_members` set to `True`, - a `ZarrStore` only inspects the underlying zarr group once, + a `ZarrStore` only inspects the underlying zarr group once, and that the results of that inspection are cached. - Otherwise, `ZarrStore.members` should inspect the underlying zarr group each time it is + Otherwise, `ZarrStore.members` should inspect the underlying zarr group each time it is invoked """ with self.create_zarr_target() as store_target: @@ -3290,16 +3293,14 @@ def test_cache_members(self) -> None: # create a new array new_key = "baz" zstore_mut.zarr_group.create(name=new_key, shape=(1,), dtype="uint8") - + observed_keys_1 = sorted(zstore_stat.array_keys()) assert observed_keys_1 == array_keys - + observed_keys_2 = sorted(zstore_mut.array_keys()) assert observed_keys_2 == sorted(array_keys + [new_key]) - - @requires_zarr @pytest.mark.skipif( KVStore is None, reason="zarr-python 2.x or ZARR_V3_EXPERIMENTAL_API is unset." @@ -3564,7 +3565,9 @@ def create_zarr_target(self): @contextlib.contextmanager def create_store(self, cache_members: bool = True): with self.create_zarr_target() as store_target: - group = backends.ZarrStore.open_group(store_target, mode="a", cache_members=cache_members) + group = backends.ZarrStore.open_group( + store_target, mode="a", cache_members=cache_members + ) yield group From 6578caaa3d0c1c0cb877d45c602e108bc22516f5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 17:59:20 +0100 Subject: [PATCH 16/38] explictly revert changes to test_write_store --- xarray/tests/test_backends.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 14756c19bb0..ea29517ef44 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -407,10 +407,14 @@ def test_zero_dimensional_variable(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) - def test_write_store(self) -> None: - expected = create_test_data() - with self.roundtrip(expected) as actual: - assert_identical(actual, expected) +def test_write_store(self) -> None: + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: From df4274f9f2be9d4af3a101c8848df642aa8ddab2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:00:16 +0000 Subject: [PATCH 17/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 08ec6e3f4b2..823af9e2344 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -407,14 +407,14 @@ def test_zero_dimensional_variable(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) -def test_write_store(self) -> None: - expected = create_test_data() - with self.create_store() as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) +def test_write_store(self) -> None: + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: From 5bf7aff1c33421611e36754cdae9d2d83822e614 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 18:07:27 +0100 Subject: [PATCH 18/38] indent correctly --- xarray/tests/test_backends.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 08ec6e3f4b2..695ed1d28fb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -407,14 +407,14 @@ def test_zero_dimensional_variable(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) -def test_write_store(self) -> None: - expected = create_test_data() - with self.create_store() as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) + def test_write_store(self) -> None: + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: From 5a818a5e98e8192f4d6aa3746887eb22a3767a7c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:18:28 +0100 Subject: [PATCH 19/38] update instrumented tests, remove cache update functionality, and set cache default to False in test fixtures --- xarray/backends/zarr.py | 42 +++++++++++++++-------------------- xarray/tests/test_backends.py | 41 +++++++++++++++++----------------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 56403d76114..6af763ea0c9 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -9,8 +9,6 @@ import numpy as np import pandas as pd -from zarr import Array as ZarrArray -from zarr import Group as ZarrGroup from xarray import coding, conventions from xarray.backends.common import ( @@ -44,7 +42,8 @@ from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree from xarray.core.types import ReadBuffer - + from zarr import Array as ZarrArray + from zarr import Group as ZarrGroup def _get_mappers(*, storage_options, store, chunk_store): # expand str and path-like arguments @@ -736,6 +735,7 @@ def __init__( use_zarr_fill_value_as_mask=None, cache_members: bool = True, ): + self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only self._synchronizer = self.zarr_group.synchronizer @@ -753,12 +753,20 @@ def __init__( if self._cache_members: # initialize the cache + # this cache is created here and never updated. + # If the `ZarrStore` instance creates a new zarr array, or if an external process + # removes an existing zarr array, then the cache will be invalid. + # create a new ZarrStore instance if you want to + # capture the current state of the zarr group, or create a ZarrStore with + # `cache_members` set to `False` to disable this cache and instead fetch members + # on demand. self._members = self._fetch_members() @property def members(self) -> dict[str, ZarrArray]: """ - Model the arrays and groups contained in self.zarr_group as a dict + Model the arrays and groups contained in self.zarr_group as a dict. If `self._cache_members` + is true, the dict is cached. Otherwise, it is retrieved from storage. """ if not self._cache_members: return self._fetch_members() @@ -769,25 +777,20 @@ def _fetch_members(self) -> dict[str, ZarrArray]: """ Get the arrays and groups defined in the zarr group modelled by this Store """ - return dict(self.zarr_group.items()) - - def _update_members(self, data: dict[str, ZarrArray]): - if not self._cache_members: - msg = ( - "Updating the members cache is only valid if this object was created " - "with cache_members=True, but this object has `cache_members=False`." - "You should update the zarr group directly." - ) - raise ValueError(msg) + import zarr + if zarr.__version__ >= "3": + return dict(self.zarr_group.members()) else: - self._members = {**self.members, **data} + return dict(self.zarr_group.items()) def array_keys(self) -> tuple[str, ...]: + from zarr import Array as ZarrArray return tuple( key for (key, node) in self.members.items() if isinstance(node, ZarrArray) ) def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: + from zarr import Array as ZarrArray return tuple( (key, node) for (key, node) in self.members.items() @@ -1060,10 +1063,6 @@ def _open_existing_array(self, *, name) -> ZarrArray: else: zarr_array = self.zarr_group[name] - # update the model of the underlying zarr group - if self._cache_members: - self._update_members({name: zarr_array}) - self._update_members({name: zarr_array}) return zarr_array def _create_new_array( @@ -1092,9 +1091,6 @@ def _create_new_array( **encoding, ) zarr_array = _put_attrs(zarr_array, attrs) - # update the model of the underlying zarr group - if self._cache_members: - self._update_members({name: zarr_array}) return zarr_array def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None): @@ -1163,8 +1159,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No zarr_array.resize(new_shape) zarr_shape = zarr_array.shape - # update the model of the members of the zarr group - self.members[name] = zarr_array region = tuple(write_region[dim] for dim in dims) # We need to do this for both new and existing variables to ensure we're not diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 695ed1d28fb..2c5d5e4fa0c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2279,7 +2279,7 @@ def create_zarr_target(self): raise NotImplementedError @contextlib.contextmanager - def create_store(self, cache_members: bool = True): + def create_store(self, cache_members: bool = False): with self.create_zarr_target() as store_target: yield backends.ZarrStore.open_group( store_target, @@ -3340,6 +3340,7 @@ def create_zarr_target(self): store = KVStore({}, **kwargs) # type: ignore[arg-type,unused-ignore] yield store + def make_patches(self, store): from unittest.mock import MagicMock @@ -3376,18 +3377,18 @@ def test_append(self) -> None: # TODO: verify these expected = { "set": 5, - "get": 7, - "list_dir": 3, + "get": 4, + "list_dir": 2, "list_prefix": 1, } else: expected = { - "iter": 2, + "iter": 1, "contains": 18, "setitem": 10, "getitem": 13, - "listdir": 1, - "list_prefix": 2, + "listdir": 0, + "list_prefix": 3, } patches = self.make_patches(store) @@ -3407,12 +3408,12 @@ def test_append(self) -> None: } else: expected = { - "iter": 3, - "contains": 9, + "iter": 1, + "contains": 11, "setitem": 6, - "getitem": 13, - "listdir": 2, - "list_prefix": 0, + "getitem": 15, + "listdir": 0, + "list_prefix": 1, } with patch.multiple(KVStore, **patches): @@ -3430,12 +3431,12 @@ def test_append(self) -> None: } else: expected = { - "iter": 3, - "contains": 9, + "iter": 1, + "contains": 11, "setitem": 6, - "getitem": 13, - "listdir": 2, - "list_prefix": 0, + "getitem": 15, + "listdir": 0, + "list_prefix": 1, } with patch.multiple(KVStore, **patches): @@ -3454,8 +3455,8 @@ def test_region_write(self) -> None: if has_zarr_v3: expected = { "set": 5, - "get": 10, - "list_dir": 3, + "get": 2, + "list_dir": 2, "list_prefix": 4, } else: @@ -3463,7 +3464,7 @@ def test_region_write(self) -> None: "iter": 3, "contains": 16, "setitem": 9, - "getitem": 13, + "getitem": 15, "listdir": 2, "list_prefix": 4, } @@ -3567,7 +3568,7 @@ def create_zarr_target(self): yield tmp @contextlib.contextmanager - def create_store(self, cache_members: bool = True): + def create_store(self, cache_members: bool = False): with self.create_zarr_target() as store_target: group = backends.ZarrStore.open_group( store_target, mode="a", cache_members=cache_members From c18f81b8613f2c31de317a91653020f255e735a8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:18:33 +0100 Subject: [PATCH 20/38] update instrumented tests, remove cache update functionality, and set cache default to False in test fixtures --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2c5d5e4fa0c..f7b0f413206 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3366,7 +3366,7 @@ def summarize(self, patches): def check_requests(self, expected, patches): summary = self.summarize(patches) for k in summary: - assert summary[k] <= expected[k], (k, summary) + assert summary[k] == expected[k], (k, summary) def test_append(self) -> None: original = Dataset({"foo": ("x", [1])}, coords={"x": [0]}) From 7b13099a4313d449565a0522dbd13016a198fc37 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:19:07 +0000 Subject: [PATCH 21/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 14 +++++++++----- xarray/tests/test_backends.py | 17 ++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6af763ea0c9..6a3448bd942 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -38,12 +38,14 @@ from xarray.namedarray.utils import module_available if TYPE_CHECKING: + from zarr import Array as ZarrArray + from zarr import Group as ZarrGroup + from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree from xarray.core.types import ReadBuffer - from zarr import Array as ZarrArray - from zarr import Group as ZarrGroup + def _get_mappers(*, storage_options, store, chunk_store): # expand str and path-like arguments @@ -735,7 +737,6 @@ def __init__( use_zarr_fill_value_as_mask=None, cache_members: bool = True, ): - self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only self._synchronizer = self.zarr_group.synchronizer @@ -753,10 +754,10 @@ def __init__( if self._cache_members: # initialize the cache - # this cache is created here and never updated. + # this cache is created here and never updated. # If the `ZarrStore` instance creates a new zarr array, or if an external process # removes an existing zarr array, then the cache will be invalid. - # create a new ZarrStore instance if you want to + # create a new ZarrStore instance if you want to # capture the current state of the zarr group, or create a ZarrStore with # `cache_members` set to `False` to disable this cache and instead fetch members # on demand. @@ -778,6 +779,7 @@ def _fetch_members(self) -> dict[str, ZarrArray]: Get the arrays and groups defined in the zarr group modelled by this Store """ import zarr + if zarr.__version__ >= "3": return dict(self.zarr_group.members()) else: @@ -785,12 +787,14 @@ def _fetch_members(self) -> dict[str, ZarrArray]: def array_keys(self) -> tuple[str, ...]: from zarr import Array as ZarrArray + return tuple( key for (key, node) in self.members.items() if isinstance(node, ZarrArray) ) def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: from zarr import Array as ZarrArray + return tuple( (key, node) for (key, node) in self.members.items() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f7b0f413206..21e59e004ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -407,14 +407,14 @@ def test_zero_dimensional_variable(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) - def test_write_store(self) -> None: - expected = create_test_data() - with self.create_store() as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) + def test_write_store(self) -> None: + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: @@ -3340,7 +3340,6 @@ def create_zarr_target(self): store = KVStore({}, **kwargs) # type: ignore[arg-type,unused-ignore] yield store - def make_patches(self, store): from unittest.mock import MagicMock From 8925020fb0b3980b803ef496dcfa386514933927 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:20:48 +0100 Subject: [PATCH 22/38] lint --- xarray/backends/zarr.py | 14 +++++++++----- xarray/tests/test_backends.py | 17 ++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6af763ea0c9..6a3448bd942 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -38,12 +38,14 @@ from xarray.namedarray.utils import module_available if TYPE_CHECKING: + from zarr import Array as ZarrArray + from zarr import Group as ZarrGroup + from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree from xarray.core.types import ReadBuffer - from zarr import Array as ZarrArray - from zarr import Group as ZarrGroup + def _get_mappers(*, storage_options, store, chunk_store): # expand str and path-like arguments @@ -735,7 +737,6 @@ def __init__( use_zarr_fill_value_as_mask=None, cache_members: bool = True, ): - self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only self._synchronizer = self.zarr_group.synchronizer @@ -753,10 +754,10 @@ def __init__( if self._cache_members: # initialize the cache - # this cache is created here and never updated. + # this cache is created here and never updated. # If the `ZarrStore` instance creates a new zarr array, or if an external process # removes an existing zarr array, then the cache will be invalid. - # create a new ZarrStore instance if you want to + # create a new ZarrStore instance if you want to # capture the current state of the zarr group, or create a ZarrStore with # `cache_members` set to `False` to disable this cache and instead fetch members # on demand. @@ -778,6 +779,7 @@ def _fetch_members(self) -> dict[str, ZarrArray]: Get the arrays and groups defined in the zarr group modelled by this Store """ import zarr + if zarr.__version__ >= "3": return dict(self.zarr_group.members()) else: @@ -785,12 +787,14 @@ def _fetch_members(self) -> dict[str, ZarrArray]: def array_keys(self) -> tuple[str, ...]: from zarr import Array as ZarrArray + return tuple( key for (key, node) in self.members.items() if isinstance(node, ZarrArray) ) def arrays(self) -> tuple[tuple[str, ZarrArray], ...]: from zarr import Array as ZarrArray + return tuple( (key, node) for (key, node) in self.members.items() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f7b0f413206..21e59e004ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -407,14 +407,14 @@ def test_zero_dimensional_variable(self) -> None: with self.roundtrip(expected) as actual: assert_identical(expected, actual) - def test_write_store(self) -> None: - expected = create_test_data() - with self.create_store() as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) + def test_write_store(self) -> None: + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) def check_dtypes_roundtripped(self, expected, actual): for k in expected.variables: @@ -3340,7 +3340,6 @@ def create_zarr_target(self): store = KVStore({}, **kwargs) # type: ignore[arg-type,unused-ignore] yield store - def make_patches(self, store): from unittest.mock import MagicMock From 192fc8b1d029d7805cf01bda372adcdad0fdcc69 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:38:51 +0100 Subject: [PATCH 23/38] update tests --- xarray/tests/test_backends.py | 37 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 21e59e004ad..cc935c2c879 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3401,8 +3401,8 @@ def test_append(self) -> None: if has_zarr_v3: expected = { "set": 4, - "get": 16, # TODO: fixme upstream (should be 8) - "list_dir": 3, # TODO: fixme upstream (should be 2) + "get": 9, # TODO: fixme upstream (should be 8) + "list_dir": 2, # TODO: fixme upstream (should be 2) "list_prefix": 0, } else: @@ -3424,8 +3424,8 @@ def test_append(self) -> None: if has_zarr_v3: expected = { "set": 4, - "get": 16, # TODO: fixme upstream (should be 8) - "list_dir": 3, # TODO: fixme upstream (should be 2) + "get": 9, # TODO: fixme upstream (should be 8) + "list_dir": 2, # TODO: fixme upstream (should be 2) "list_prefix": 0, } else: @@ -3479,7 +3479,7 @@ def test_region_write(self) -> None: expected = { "set": 1, "get": 3, - "list_dir": 2, + "list_dir": 0, "list_prefix": 0, } else: @@ -3502,8 +3502,8 @@ def test_region_write(self) -> None: if has_zarr_v3: expected = { "set": 1, - "get": 5, - "list_dir": 2, + "get": 4, + "list_dir": 0, "list_prefix": 0, } else: @@ -3525,7 +3525,7 @@ def test_region_write(self) -> None: expected = { "set": 0, "get": 5, - "list_dir": 1, + "list_dir": 0, "list_prefix": 0, } else: @@ -3567,13 +3567,26 @@ def create_zarr_target(self): yield tmp @contextlib.contextmanager - def create_store(self, cache_members: bool = False): + def create_store(self, **kwargs): with self.create_zarr_target() as store_target: - group = backends.ZarrStore.open_group( - store_target, mode="a", cache_members=cache_members - ) + group = backends.ZarrStore.open_group(store_target, mode="a", **kwargs) yield group + def test_write_store(self) -> None: + # This test is overriden from the base implementation because we need to ensure + # that the members cache is off, as the `ZarrStore` instance is re-used in the + # test function. Refactoring the base version of this test to + # if this test is refactored to no longer re-use the store object, then + # this implementation can be removed. + + expected = create_test_data() + with self.create_store(cache_members=False) as store: + expected.dump_to_store(store) + # we need to cf decode the store because it has time and + # non-dimension coordinates + with xr.decode_cf(store) as actual: + assert_allclose(expected, actual) + @requires_zarr class TestZarrWriteEmpty(TestZarrDirectoryStore): From 2ce5b2e49f313f63d5617ff64b1c94e32991ce98 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:39:22 +0100 Subject: [PATCH 24/38] make check_requests more permissive --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cc935c2c879..6d725623b56 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3365,7 +3365,7 @@ def summarize(self, patches): def check_requests(self, expected, patches): summary = self.summarize(patches) for k in summary: - assert summary[k] == expected[k], (k, summary) + assert summary[k] <= expected[k], (k, summary) def test_append(self) -> None: original = Dataset({"foo": ("x", [1])}, coords={"x": [0]}) From 87a40c79d4f48e446063a089d62978fb03c617b7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 21:54:58 +0100 Subject: [PATCH 25/38] update instrumented tests for zarr==2.18 --- xarray/tests/test_backends.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6d725623b56..e192d833b61 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3460,12 +3460,12 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 3, + "iter": 1, "contains": 16, "setitem": 9, - "getitem": 15, - "listdir": 2, - "list_prefix": 4, + "getitem": 13, + "listdir": 0, + "list_prefix": 5, } patches = self.make_patches(store) @@ -3485,10 +3485,10 @@ def test_region_write(self) -> None: else: expected = { "iter": 1, - "contains": 4, + "contains": 6, "setitem": 1, - "getitem": 5, - "listdir": 1, + "getitem": 7, + "listdir": 0, "list_prefix": 0, } @@ -3508,11 +3508,11 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 2, - "contains": 4, + "iter": 1, + "contains": 6, "setitem": 1, - "getitem": 6, - "listdir": 2, + "getitem": 8, + "listdir": 0, "list_prefix": 0, } @@ -3530,11 +3530,11 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 2, - "contains": 4, - "setitem": 1, - "getitem": 6, - "listdir": 2, + "iter": 1, + "contains": 6, + "setitem": 0, + "getitem": 8, + "listdir": 0, "list_prefix": 0, } From d98752999817ce574ee1008c71f6bfe08c819f42 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 14 Dec 2024 12:26:49 +0100 Subject: [PATCH 26/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6a3448bd942..0a463201461 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -757,6 +757,7 @@ def __init__( # this cache is created here and never updated. # If the `ZarrStore` instance creates a new zarr array, or if an external process # removes an existing zarr array, then the cache will be invalid. + # We use this cache only to record any pre-existing arrays when the group was opened # create a new ZarrStore instance if you want to # capture the current state of the zarr group, or create a ZarrStore with # `cache_members` set to `False` to disable this cache and instead fetch members From 169b736bad8a3d5e5c6ba13746e84ccb44992e33 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 14 Dec 2024 12:27:18 +0100 Subject: [PATCH 27/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0a463201461..bb5c5fd4535 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1337,7 +1337,7 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, - cache_array_keys: bool = False, + cache_members: bool = False, **kwargs, ): """Load and decode a dataset from a Zarr store. From e20d11dc4b2d84fd63e6ba8247d4a5b732ad86af Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 14 Dec 2024 12:27:34 +0100 Subject: [PATCH 28/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bb5c5fd4535..ea0f1e09a79 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -807,8 +807,7 @@ def ds(self): # TODO: consider deprecating this in favor of zarr_group return self.zarr_group - def open_store_variable(self, name, zarr_array=None): - if zarr_array is None: + def open_store_variable(self, name): zarr_array = self.members[name] data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" From a1c25f2d58af76dc8b100791fcf1e1d60751faf3 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 14 Dec 2024 12:27:45 +0100 Subject: [PATCH 29/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ea0f1e09a79..7f26cd06530 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -856,7 +856,7 @@ def open_store_variable(self, name): return Variable(dimensions, data, attributes, encoding) def get_variables(self): - return FrozenDict((k, self.open_store_variable(k, v)) for k, v in self.arrays()) + return FrozenDict((k, self.open_store_variable(k)) for k in self.array_keys()) def get_attrs(self): return { From f8d78cb8935a049a06cb32a89f75b063900e5dbc Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 16 Dec 2024 17:50:08 +0100 Subject: [PATCH 30/38] Update xarray/backends/zarr.py Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7f26cd06530..df00d7b79f5 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1336,7 +1336,7 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, - cache_members: bool = False, + cache_members: bool = True, **kwargs, ): """Load and decode a dataset from a Zarr store. From 650937e8dfa161d0b343516e2a63dbe2b4eb6928 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 11:16:12 +0100 Subject: [PATCH 31/38] doc: add whats new content --- doc/whats-new.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6a08246182c..28fe502df7a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,14 @@ New Features - Better support wrapping additional array types (e.g. ``cupy`` or ``jax``) by calling generalized duck array operations throughout more xarray methods. (:issue:`7848`, :pull:`9798`). By `Sam Levang `_. - +- Better performance for reading Zarr arrays in the ``ZarrStore`` class by caching the state of Zarr +storage and avoiding redundant IO operations. Usage of the cache can be controlled via the +``cache_members`` parameter to ``ZarrStore``. When ``cache_members`` is ``True`` (the default), the +``ZarrStore`` stores a snapshot of names and metadata of the in-scope Zarr arrays; this cache +is then used when iterating over those Zarr arrays, which avoids IO operations and thereby reduces +latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. + +new parameter, ``cache_members``. Breaking changes ~~~~~~~~~~~~~~~~ From fcf2d646bdadb994c7f13305777eeeea0b747db4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 12:18:33 +0100 Subject: [PATCH 32/38] fixup --- xarray/backends/zarr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index df00d7b79f5..c87dd379c33 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -808,7 +808,7 @@ def ds(self): return self.zarr_group def open_store_variable(self, name): - zarr_array = self.members[name] + zarr_array = self.members[name] data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( @@ -1490,7 +1490,7 @@ def open_zarr( "storage_options": storage_options, "zarr_version": zarr_version, "zarr_format": zarr_format, - "cache_array_keys": cache_array_keys, + "cache_members": cache_members, } ds = open_dataset( From dcecf43167e1a7af03d89f1e930799957dcf0c98 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 13:16:28 +0100 Subject: [PATCH 33/38] update whats new --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 669a3116598..9eda7df7cc7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,7 @@ New Features - Better support wrapping additional array types (e.g. ``cupy`` or ``jax``) by calling generalized duck array operations throughout more xarray methods. (:issue:`7848`, :pull:`9798`). By `Sam Levang `_. + - Better performance for reading Zarr arrays in the ``ZarrStore`` class by caching the state of Zarr storage and avoiding redundant IO operations. Usage of the cache can be controlled via the ``cache_members`` parameter to ``ZarrStore``. When ``cache_members`` is ``True`` (the default), the @@ -31,7 +32,6 @@ storage and avoiding redundant IO operations. Usage of the cache can be controll is then used when iterating over those Zarr arrays, which avoids IO operations and thereby reduces latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. -new parameter, ``cache_members``. - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). By `Kai Mühlbauer `_. From d0bb5a2508a3d141a785697468e500a8691226b7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 13:51:31 +0100 Subject: [PATCH 34/38] fixup --- doc/whats-new.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9eda7df7cc7..a807a793c71 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,11 +26,11 @@ New Features By `Sam Levang `_. - Better performance for reading Zarr arrays in the ``ZarrStore`` class by caching the state of Zarr -storage and avoiding redundant IO operations. Usage of the cache can be controlled via the -``cache_members`` parameter to ``ZarrStore``. When ``cache_members`` is ``True`` (the default), the -``ZarrStore`` stores a snapshot of names and metadata of the in-scope Zarr arrays; this cache -is then used when iterating over those Zarr arrays, which avoids IO operations and thereby reduces -latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. + storage and avoiding redundant IO operations. Usage of the cache can be controlled via the + ``cache_members`` parameter to ``ZarrStore``. When ``cache_members`` is ``True`` (the default), the + ``ZarrStore`` stores a snapshot of names and metadata of the in-scope Zarr arrays; this cache + is then used when iterating over those Zarr arrays, which avoids IO operations and thereby reduces + latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). From 65cadcba8f3ffd67c349c1b4bab913745749a8ea Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 14:05:37 +0100 Subject: [PATCH 35/38] remove vestigial cache_array_keys --- xarray/backends/zarr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ac02a7459da..bb316f00f41 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1560,7 +1560,7 @@ def open_dataset( store=None, engine=None, use_zarr_fill_value_as_mask=None, - cache_array_keys: bool = True, + cache_members: bool = True, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1576,7 +1576,7 @@ def open_dataset( zarr_version=zarr_version, use_zarr_fill_value_as_mask=None, zarr_format=zarr_format, - cache_members=cache_array_keys, + cache_members=cache_members, ) store_entrypoint = StoreBackendEntrypoint() From e8cd3c85f749ea97b0c0c85687a3b5f188ba9d32 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Dec 2024 15:13:38 +0100 Subject: [PATCH 36/38] make cache_members default to True --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bb316f00f41..fafcd939944 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -635,7 +635,7 @@ def open_store( zarr_format=None, use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, - cache_members: bool = False, + cache_members: bool = True, ): ( zarr_group, From 8d31f0ee4a41ec9560e73af482b3c093b22ef39b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Dec 2024 10:00:51 -0700 Subject: [PATCH 37/38] tweak --- xarray/tests/test_backends.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e192d833b61..560090e122c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2579,7 +2579,7 @@ def test_hidden_zarr_keys(self) -> None: skip_if_zarr_format_3("This test is unnecessary; no hidden Zarr keys") expected = create_test_data() - with self.create_store(cache_members=False) as store: + with self.create_store() as store: expected.dump_to_store(store) zarr_group = store.ds @@ -3566,27 +3566,6 @@ def create_zarr_target(self): with create_tmp_file(suffix=".zarr") as tmp: yield tmp - @contextlib.contextmanager - def create_store(self, **kwargs): - with self.create_zarr_target() as store_target: - group = backends.ZarrStore.open_group(store_target, mode="a", **kwargs) - yield group - - def test_write_store(self) -> None: - # This test is overriden from the base implementation because we need to ensure - # that the members cache is off, as the `ZarrStore` instance is re-used in the - # test function. Refactoring the base version of this test to - # if this test is refactored to no longer re-use the store object, then - # this implementation can be removed. - - expected = create_test_data() - with self.create_store(cache_members=False) as store: - expected.dump_to_store(store) - # we need to cf decode the store because it has time and - # non-dimension coordinates - with xr.decode_cf(store) as actual: - assert_allclose(expected, actual) - @requires_zarr class TestZarrWriteEmpty(TestZarrDirectoryStore): From 3d15d3d456e9257a5dc86cb69e1dd0b3d097e804 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Dec 2024 10:08:34 -0700 Subject: [PATCH 38/38] Remove from public API --- xarray/backends/zarr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index fafcd939944..e0a4a042634 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1334,7 +1334,6 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, - cache_members: bool = True, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -1488,7 +1487,6 @@ def open_zarr( "storage_options": storage_options, "zarr_version": zarr_version, "zarr_format": zarr_format, - "cache_members": cache_members, } ds = open_dataset( pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy