From c703ebc74f73e3fdc33ea302425bbb689f6204f6 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 7 Aug 2023 12:03:39 +0200
Subject: [PATCH 01/17] add set_indexes parameter to open_dataset

---
 xarray/backends/api.py    | 16 ++++++++++++++++
 xarray/backends/common.py |  1 +
 2 files changed, 17 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e35d85a1e2f..254ed54a3a0 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -403,6 +403,7 @@ def open_dataset(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -492,6 +493,12 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -570,6 +577,7 @@ def open_dataset(
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         **decoders,
         **kwargs,
     )
@@ -604,6 +612,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    set_indexes: bool = True,
    inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -695,6 +704,12 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    set_indexes : bool, optional
+        If True (default), create new indexes from coordinates. Both the number and
+        the type(s) of those indexes depend on the backend used to open the dataset.
+        For most common backends this creates a pandas index for each
+        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
+        Set it to False if you want to avoid loading data into memory.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -752,6 +767,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 1ac988c6b4f..f12057bd4af 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -490,6 +490,7 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         **kwargs: Any,
     ) -> Dataset:
         """

From 6f54cd52e8030cf79eac0c4b1b647c08de901131 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 7 Aug 2023 12:04:50 +0200
Subject: [PATCH 02/17] implement set_indexes in (zarr) backend store

---
 xarray/backends/store.py | 20 ++++++++++++++++++--
 xarray/backends/zarr.py  |  8 ++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index a507ee37470..e15e6b08c0f 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset

 if TYPE_CHECKING:
@@ -35,6 +36,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -55,8 +57,22 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             decode_timedelta=decode_timedelta,
         )

-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        if set_indexes:
+            coords = coord_vars
+        else:
+            # explicit Coordinates object with no index passed
+            coords = Coordinates(coord_vars)
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index f88523422bb..2de008b8c72 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -757,6 +757,7 @@ def open_zarr(
     zarr_version=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    set_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -850,6 +851,10 @@ def open_zarr(
         chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
         Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    set_indexes : bool, optional
+        If True (default), create a default (pandas) index for each
+        :term:`Dimension coordinate`. Set it to False if the dataset contains
+        dimension coordinate arrays that are too large to load fully in memory.

     Returns
     -------
@@ -906,6 +911,7 @@ def open_zarr(
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        set_indexes=set_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -950,6 +956,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -986,6 +993,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             drop_variables=drop_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
+            set_indexes=set_indexes,
         )

         return ds

From 145ae1c44c4d2bdbcf74c9107172e4119a969d30 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:03:21 +0200
Subject: [PATCH 03/17] replace `set_indexes` with `create_default_indexes`

---
 xarray/backends/api.py    | 51 +++++++++++++++++++++++++--------------
 xarray/backends/common.py |  2 --
 xarray/backends/store.py  |  7 ++-----
 xarray/backends/zarr.py   | 17 +++++++------
 4 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e4fcfe5eeb6..1b5d8e1304b 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -36,6 +36,7 @@
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
 from xarray.core import indexing
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -389,6 +390,7 @@ def _dataset_from_backend_dataset(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -397,11 +399,22 @@ def _dataset_from_backend_dataset(
         )

     _protect_dataset_variables_inplace(backend_ds, cache)
+
+    if create_default_indexes:
+        to_index = {
+            name: coord.variable
+            for name, coord in backend_ds.coords.items()
+            if coord.dims == (name,) and name not in backend_ds.xindexes
+        }
+        indexed = backend_ds.assign_coords(Coordinates(to_index))
+    else:
+        indexed = backend_ds
+
     if chunks is None:
-        ds = backend_ds
+        ds = indexed
     else:
         ds = _chunk_ds(
-            backend_ds,
+            indexed,
             filename_or_obj,
             engine,
             chunks,
@@ -497,7 +510,7 @@ def open_dataset(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
-    set_indexes: bool = True,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -611,12 +624,13 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
-    set_indexes : bool, optional
-        If True (default), create new indexes from coordinates. Both the number and
-        the type(s) of those indexes depend on the backend used to open the dataset.
-        For most common backends this creates a pandas index for each
-        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
-        Set it to False if you want to avoid loading data into memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -695,7 +709,6 @@ def open_dataset(
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
         **decoders,
         **kwargs,
     )
@@ -710,6 +723,7 @@ def open_dataset(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -733,7 +747,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
-    set_indexes: bool = True,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -842,12 +856,13 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
-    set_indexes : bool, optional
-        If True (default), create new indexes from coordinates. Both the number and
-        the type(s) of those indexes depend on the backend used to open the dataset.
-        For most common backends this creates a pandas index for each
-        :term:`Dimension coordinate`, which loads the coordinate data fully in memory.
-        Set it to False if you want to avoid loading data into memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -905,7 +920,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
+        create_default_indexes=create_default_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 9ef0da81659..e1f8dc5cecd 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -697,8 +697,6 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
-        set_indexes: bool = True,
-        **kwargs: Any,
     ) -> Dataset:
         """
         Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index 0b34ea7ed8c..7edfbd1c4e0 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -67,11 +67,8 @@ def open_dataset(
             else:
                 data_vars[name] = var

-        if set_indexes:
-            coords = coord_vars
-        else:
-            # explicit Coordinates object with no index passed
-            coords = Coordinates(coord_vars)
+        # explicit Coordinates object with no index passed
+        coords = Coordinates(coord_vars)

         ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index f8ff1fa6b7f..dc78194dcd2 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -1347,7 +1347,7 @@ def open_zarr(
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
-    set_indexes=True,
+    create_default_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1458,10 +1458,13 @@ def open_zarr(
         chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg.
         Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
-    set_indexes : bool, optional
-        If True (default), create a default (pandas) index for each
-        :term:`Dimension coordinate`. Set it to False if the dataset contains
-        dimension coordinate arrays that are too large to load fully in memory.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.

     Returns
     -------
@@ -1518,7 +1521,7 @@ def open_zarr(
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
-        set_indexes=set_indexes,
+        create_default_indexes=create_default_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -1564,7 +1567,6 @@ def open_dataset(
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
-        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -1608,7 +1610,6 @@ def open_dataset(
             drop_variables=drop_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
-            set_indexes=set_indexes,
         )

         return ds

From 192c367f47c14a1d4e1837e61702eedda07c4a8a Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:20:55 +0200
Subject: [PATCH 04/17] make sure indexes set by the backend survive

---
 xarray/tests/test_backends_api.py | 36 +++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 9342423b727..885b8cb8a46 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -201,3 +201,39 @@ def test_join_chunks(self, shape, pref_chunks, req_chunks):
             chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)),
         )
         self.check_dataset(initial, final, explicit_chunks(req_chunks, shape))
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes(self, create_default_indexes):
+        """Create default indexes if the backend does not create them."""
+        coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={})
+        initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords)
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        if create_default_indexes:
+            assert all(name in final.xindexes for name in ["x", "y"])
+        else:
+            assert not final.xindexes
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes_passthrough(self, create_default_indexes):
+        """Allow creating indexes in the backend."""
+
+        initial = xr.Dataset(
+            {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])},
+            coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))},
+        ).stack(z=["x", "y"])
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        assert initial.coords.equals(final.coords)

From f5823a73107643ee1206023cbdb2086f1c375437 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:27:01 +0200
Subject: [PATCH 05/17] also add the parameter to `open_datatree`

---
 xarray/backends/api.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 1b5d8e1304b..e3582370cb4 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -1206,6 +1206,7 @@ def open_groups(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1317,6 +1318,13 @@ def open_groups(
         A variable or list of variables to exclude from being parsed from the dataset.
         This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1412,6 +1420,7 @@ def open_groups(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )

From 2ff8402192a3afd443e155e5a3c10d592475dce7 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Mon, 30 Jun 2025 23:37:23 +0200
Subject: [PATCH 06/17] share the implementation of the default indexes creation

---
 xarray/backends/api.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index e3582370cb4..0ac1a4d9503 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,6 +380,18 @@ def _chunk_ds(
     return backend_ds._replace(variables)


+def _create_default_indexes(ds, create_default_indexes):
+    if not create_default_indexes:
+        return ds
+
+    to_index = {
+        name: coord.variable
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    }
+    return ds.assign_coords(Coordinates(to_index))
+
+
 def _dataset_from_backend_dataset(
     backend_ds,
     filename_or_obj,
@@ -400,15 +412,7 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    if create_default_indexes:
-        to_index = {
-            name: coord.variable
-            for name, coord in backend_ds.coords.items()
-            if coord.dims == (name,) and name not in backend_ds.xindexes
-        }
-        indexed = backend_ds.assign_coords(Coordinates(to_index))
-    else:
-        indexed = backend_ds
+    indexed = _create_default_indexes(backend_ds, create_default_indexes)

     if chunks is None:
         ds = indexed
@@ -447,6 +451,7 @@ def _datatree_from_backend_datatree(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -461,7 +466,7 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset,
+                node.dataset.pipe(_create_default_indexes, create_default_indexes),
                 filename_or_obj,
                 engine,
                 chunks,
@@ -977,6 +982,7 @@ def open_datatree(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1086,6 +1092,13 @@ def open_datatree(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1179,6 +1192,7 @@ def open_datatree(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )

From 294b2f712425ca9424a4d6ad62912d653087beb5 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:00:11 +0200
Subject: [PATCH 07/17] check that the store backend entrypoint does not
 create default indexes

---
 xarray/tests/test_backends.py     | 61 +++++++++++++++++++++++++++++++
 xarray/tests/test_backends_api.py |  2 +-
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 733188dde1e..92617b5d7b7 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -55,6 +55,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
+from xarray.core.indexes import PandasIndex
 from xarray.core.options import set_options
 from xarray.core.utils import module_available
 from xarray.namedarray.pycompat import array_type
@@ -2050,6 +2051,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
         with self.roundtrip(original):
             pass

+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+

 @requires_netCDF4
 class TestNetCDF4Data(NetCDF4Base):
@@ -4009,6 +4030,26 @@ def test_pickle(self) -> None:
     def test_pickle_dataarray(self) -> None:
         pass

+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+

 @requires_scipy
 class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
@@ -6378,6 +6419,26 @@ def test_zarr_closing_internal_zip_store():
     assert_identical(original_da, loaded_da)


+@requires_zarr
+@pytest.mark.parametrize("create_default_indexes", [True, False])
+def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
+    store_path = tmp_path / "tmp.zarr"
+    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
+    original_ds.to_zarr(store_path, mode="w")
+
+    with open_dataset(
+        store_path, engine="zarr", create_default_indexes=create_default_indexes
+    ) as loaded_ds:
+        if create_default_indexes:
+            assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                loaded_ds.xindexes["x"], PandasIndex
+            )
+        else:
+            assert len(loaded_ds.xindexes) == 0
+

 @requires_zarr
 @pytest.mark.usefixtures("default_zarr_format")
 def test_raises_key_error_on_invalid_zarr_store(tmp_path):
diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 885b8cb8a46..778e800ec67 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -218,7 +218,7 @@ def test_default_indexes(self, create_default_indexes):
         if create_default_indexes:
             assert all(name in final.xindexes for name in ["x", "y"])
         else:
-            assert not final.xindexes
+            assert len(final.xindexes) == 0

From 5c3a8437839ef22d2cdd9c907b00e7bfb1c775bd Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:01:50 +0200
Subject: [PATCH 08/17] actually do not create default indexes in the backends

---
 xarray/backends/store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index 7edfbd1c4e0..de52aa193ed 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -68,7 +68,7 @@ def open_dataset(
             data_vars[name] = var

         # explicit Coordinates object with no index passed
-        coords = Coordinates(coord_vars)
+        coords = Coordinates(coord_vars, indexes={})

         ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)

From 08939dee3857913711098f1ddce52fc93ad43e47 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 2 Jul 2025 00:02:56 +0200
Subject: [PATCH 09/17] rename the helper

---
 xarray/backends/api.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0ac1a4d9503..38f96270c99 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,7 +380,7 @@ def _chunk_ds(
     return backend_ds._replace(variables)


-def _create_default_indexes(ds, create_default_indexes):
+def _maybe_create_default_indexes(ds, create_default_indexes):
     if not create_default_indexes:
         return ds

@@ -412,7 +412,7 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    indexed = _create_default_indexes(backend_ds, create_default_indexes)
+    indexed = _maybe_create_default_indexes(backend_ds, create_default_indexes)

     if chunks is None:
         ds = indexed
@@ -466,7 +466,9 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset.pipe(_create_default_indexes, create_default_indexes),
+                node.dataset.pipe(
+                    _maybe_create_default_indexes, create_default_indexes
+                ),
                 filename_or_obj,
                 engine,
                 chunks,

From 95dbf8e6e787d15f1a649eafa492e9e7e6c6c4e8 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 3 Jul 2025 19:08:37 +0200
Subject: [PATCH 10/17] move the handling of `create_default_indexes` up the
 call stack

---
 xarray/backends/api.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 38f96270c99..cb4ef3fa813 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -380,10 +380,7 @@ def _chunk_ds(
     return backend_ds._replace(variables)


-def _maybe_create_default_indexes(ds, create_default_indexes):
-    if not create_default_indexes:
-        return ds
-
+def _maybe_create_default_indexes(ds):
     to_index = {
         name: coord.variable
         for name, coord in ds.coords.items()
         if coord.dims == (name,) and name not in ds.xindexes
     }
     return ds.assign_coords(Coordinates(to_index))
@@ -412,13 +409,14 @@ def _dataset_from_backend_dataset(

     _protect_dataset_variables_inplace(backend_ds, cache)

-    indexed = _maybe_create_default_indexes(backend_ds, create_default_indexes)
+    if create_default_indexes:
+        backend_ds = _maybe_create_default_indexes(backend_ds)

     if chunks is None:
-        ds = indexed
+        ds = backend_ds
     else:
         ds = _chunk_ds(
-            indexed,
+            backend_ds,
             filename_or_obj,
             engine,
             chunks,
@@ -466,8 +464,10 @@ def _datatree_from_backend_datatree(
     tree = DataTree.from_dict(
         {
             path: _chunk_ds(
-                node.dataset.pipe(
-                    _maybe_create_default_indexes, create_default_indexes
+                (
+                    _maybe_create_default_indexes(node.dataset)
+                    if create_default_indexes
+                    else node.dataset
                 ),
                 filename_or_obj,
                 engine,
                 chunks,

From d7e6daa7f706234133a4bf160583668622cfec1c Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 3 Jul 2025 19:12:06 +0200
Subject: [PATCH 11/17] what's new

---
 doc/whats-new.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ad83cfac531..8c0f5cf635c 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -12,7 +12,8 @@ v2025.07.1 (unreleased)

 New Features
 ~~~~~~~~~~~~
-
+- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`).
+  By `Benoit Bovy <https://github.com/benbovy>`_ and `Justus Magin <https://github.com/keewis>`_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

From 3d483d3f778a426f0be454a3b4461e289a210a4c Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Tue, 8 Jul 2025 09:26:04 -0700
Subject: [PATCH 12/17] Fix

---
 xarray/backends/api.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index cb4ef3fa813..cb101d7c36b 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -410,13 +410,11 @@ def _dataset_from_backend_dataset(
     _protect_dataset_variables_inplace(backend_ds, cache)

     if create_default_indexes:
-        backend_ds = _maybe_create_default_indexes(backend_ds)
+        ds = _maybe_create_default_indexes(backend_ds)

-    if chunks is None:
-        ds = backend_ds
-    else:
+    if chunks is not None:
         ds = _chunk_ds(
-            backend_ds,
+            ds,
             filename_or_obj,
             engine,
             chunks,

From 741564e3823ea0d8c1c06e4d48754f5f9b787964 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Tue, 8 Jul 2025 09:31:54 -0700
Subject: [PATCH 13/17] fix again

---
 xarray/backends/api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index cb101d7c36b..0063df35159 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -411,6 +411,8 @@ def _dataset_from_backend_dataset(

     if create_default_indexes:
         ds = _maybe_create_default_indexes(backend_ds)
+    else:
+        ds = backend_ds

     if chunks is not None:
         ds = _chunk_ds(

From 8889eda5d3ea6e4c40ee61f280d3b928f7b5efbc Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 8 Jul 2025 10:47:53 -0700
Subject: [PATCH 14/17] also create default indexes without chunks

---
 xarray/backends/api.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0063df35159..4aecf6b27f0 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -459,7 +459,10 @@ def _datatree_from_backend_datatree(
     _protect_datatree_variables_inplace(backend_tree, cache)

     if chunks is None:
-        tree = backend_tree
+        if create_default_indexes:
+            tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
+        else:
+            tree = backend_tree
     else:
         tree = DataTree.from_dict(
             {

From 804db4cb997a16a617492336a8d7d3914703790d Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 8 Jul 2025 10:50:10 -0700
Subject: [PATCH 15/17] also copy `_close`

---
 xarray/backends/api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 4aecf6b27f0..40114200318 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -461,6 +461,8 @@ def _datatree_from_backend_datatree(
     if chunks is None:
         if create_default_indexes:
             tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
+            for path, [node] in group_subtrees(backend_tree):
+                tree[path].set_close(node._close)
         else:
             tree = backend_tree

From 75c1dd6e3b8e16c951be292ac5e893598fc0e467 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 8 Jul 2025 10:51:09 -0700
Subject: [PATCH 16/17] reuse the code for copying `_close`

---
 xarray/backends/api.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 40114200318..a9d13adf92e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -461,8 +461,6 @@ def _datatree_from_backend_datatree(
     if chunks is None:
         if create_default_indexes:
             tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
-            for path, [node] in group_subtrees(backend_tree):
-                tree[path].set_close(node._close)
         else:
             tree = backend_tree
     else:
@@ -489,6 +487,7 @@ def _datatree_from_backend_datatree(
             name=backend_tree.name,
         )

+    if create_default_indexes or chunks is not None:
     for path, [node] in group_subtrees(backend_tree):
         tree[path].set_close(node._close)

From a0d94fbbdaa9c086a465a7e3200b3ba8a6399484 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Tue, 8 Jul 2025 11:41:36 -0700
Subject: [PATCH 17/17] refactor

---
 xarray/backends/api.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index a9d13adf92e..cfd3ff7fc0f 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -458,20 +458,15 @@ def _datatree_from_backend_datatree(
     )
     _protect_datatree_variables_inplace(backend_tree, cache)

-    if chunks is None:
-        if create_default_indexes:
-            tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
-        else:
-            tree = backend_tree
+    if create_default_indexes:
+        tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
     else:
+        tree = backend_tree
+    if chunks is not None:
         tree = DataTree.from_dict(
             {
                 path: _chunk_ds(
-                    (
-                        _maybe_create_default_indexes(node.dataset)
-                        if create_default_indexes
-                        else node.dataset
-                    ),
+                    node.dataset,
                     filename_or_obj,
                     engine,
                     chunks,
@@ -482,9 +477,9 @@ def _datatree_from_backend_datatree(
                 node=path,
                 **extra_tokens,
             )
-            for path, [node] in group_subtrees(backend_tree)
+            for path, [node] in group_subtrees(tree)
         },
-        name=backend_tree.name,
+        name=tree.name,
     )

     if create_default_indexes or chunks is not None:
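
Usage sketch of the final API introduced by this series. This example is not
part of the patches themselves; the path "example.zarr" is hypothetical, and
it assumes xarray's public `Dataset.set_xindex` method for opting back in to
an index after opening.

    import numpy as np
    import xarray as xr

    # write a small store with a dimension coordinate "x",
    # mirroring the datasets used in the tests above
    ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
    ds.to_zarr("example.zarr", mode="w")

    # default behaviour: a pandas index is created for "x",
    # which loads the coordinate data into memory
    with xr.open_dataset("example.zarr", engine="zarr") as opened:
        assert "x" in opened.xindexes

    # skip the default indexes to keep coordinate data lazy; label-based
    # selection then requires creating an index explicitly
    with xr.open_dataset(
        "example.zarr", engine="zarr", create_default_indexes=False
    ) as opened:
        assert len(opened.xindexes) == 0
        indexed = opened.set_xindex("x")  # defaults to a pandas index

Skipping the default indexes is mainly useful when dimension coordinate
arrays are too large to load eagerly, as the docstrings above note; indexes
can still be created later for just the coordinates that need label-based
lookups.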