diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e8b602e9dc9..1de857032d0 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -12,7 +12,8 @@ v2025.07.1 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
-
+- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`).
+  By `Benoit Bovy <https://github.com/benbovy>`_ and `Justus Magin <https://github.com/keewis>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index b80ec927b1e..cfd3ff7fc0f 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -36,6 +36,7 @@
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
 from xarray.core import indexing
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -379,6 +380,15 @@ def _chunk_ds(
     return backend_ds._replace(variables)
 
 
+def _maybe_create_default_indexes(ds):
+    to_index = {
+        name: coord.variable
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    }
+    return ds.assign_coords(Coordinates(to_index))
+
+
 def _dataset_from_backend_dataset(
     backend_ds,
     filename_or_obj,
@@ -389,6 +399,7 @@ def _dataset_from_backend_dataset(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -397,11 +408,15 @@ def _dataset_from_backend_dataset(
         )
 
     _protect_dataset_variables_inplace(backend_ds, cache)
-    if chunks is None:
-        ds = backend_ds
+
+    if create_default_indexes:
+        ds = _maybe_create_default_indexes(backend_ds)
     else:
+        ds = backend_ds
+
+    if chunks is not None:
         ds = _chunk_ds(
-            backend_ds,
+            ds,
             filename_or_obj,
             engine,
             chunks,
@@ -434,6 +449,7 @@ def _datatree_from_backend_datatree(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -442,9 +458,11 @@ def _datatree_from_backend_datatree(
         )
 
     _protect_datatree_variables_inplace(backend_tree, cache)
-    if chunks is None:
-        tree = backend_tree
+    if create_default_indexes:
+        tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
     else:
+        tree = backend_tree
+    if chunks is not None:
         tree = DataTree.from_dict(
             {
                 path: _chunk_ds(
@@ -459,11 +477,12 @@ def _datatree_from_backend_datatree(
                         node=path,
                         **extra_tokens,
                     )
-                    for path, [node] in group_subtrees(backend_tree)
+                    for path, [node] in group_subtrees(tree)
                 },
-                name=backend_tree.name,
+                name=tree.name,
             )
 
+    if create_default_indexes or chunks is not None:
         for path, [node] in group_subtrees(backend_tree):
             tree[path].set_close(node._close)
 
@@ -497,6 +516,7 @@ def open_dataset(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -610,6 +630,13 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -702,6 +729,7 @@ def open_dataset(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -725,6 +753,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -833,6 +862,13 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -890,6 +926,7 @@ def open_dataarray(
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
@@ -946,6 +983,7 @@ def open_datatree(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1055,6 +1093,13 @@ def open_datatree(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1148,6 +1193,7 @@ def open_datatree(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -1175,6 +1221,7 @@ def open_groups(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1286,6 +1333,13 @@ def open_groups(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1381,6 +1435,7 @@ def open_groups(
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index b1b3956ca8e..de52aa193ed 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset
 
 if TYPE_CHECKING:
@@ -36,6 +37,7 @@ def open_dataset(
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -56,8 +58,19 @@ def open_dataset(
             decode_timedelta=decode_timedelta,
         )
 
-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        # explicit Coordinates object with no index passed
+        coords = Coordinates(coord_vars, indexes={})
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
 
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 48405b906cd..8b26a6b40ec 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -1347,6 +1347,7 @@ def open_zarr(
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    create_default_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1457,6 +1458,13 @@ def open_zarr(
         chunked arrays, via whichever chunk manager is specified through the
         ``chunked_array_type`` kwarg. Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
 
     Returns
     -------
@@ -1513,6 +1521,7 @@ def open_zarr(
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 785b06a26fd..a9063c4dcc9 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -55,6 +55,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
+from xarray.core.indexes import PandasIndex
 from xarray.core.options import set_options
 from xarray.core.types import PDDatetimeUnitOptions
 from xarray.core.utils import module_available
@@ -2066,6 +2067,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
         with self.roundtrip(original):
             pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_netCDF4
 class TestNetCDF4Data(NetCDF4Base):
@@ -4063,6 +4084,26 @@ def test_pickle(self) -> None:
         pass
 
     def test_pickle_dataarray(self) -> None:
         pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_scipy
 class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
@@ -6434,6 +6475,26 @@ def test_zarr_closing_internal_zip_store():
     assert_identical(original_da, loaded_da)
 
 
+@requires_zarr
+@pytest.mark.parametrize("create_default_indexes", [True, False])
+def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
+    store_path = tmp_path / "tmp.zarr"
+    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
+    original_ds.to_zarr(store_path, mode="w")
+
+    with open_dataset(
+        store_path, engine="zarr", create_default_indexes=create_default_indexes
+    ) as loaded_ds:
+        if create_default_indexes:
+            assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                loaded_ds.xindexes["x"], PandasIndex
+            )
+        else:
+            assert len(loaded_ds.xindexes) == 0
+
+
 @requires_zarr
 @pytest.mark.usefixtures("default_zarr_format")
 def test_raises_key_error_on_invalid_zarr_store(tmp_path):
diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 9342423b727..778e800ec67 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -201,3 +201,39 @@ def test_join_chunks(self, shape, pref_chunks, req_chunks):
             chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)),
         )
         self.check_dataset(initial, final, explicit_chunks(req_chunks, shape))
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes(self, create_default_indexes):
+        """Create default indexes if the backend does not create them."""
+        coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={})
+        initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords)
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        if create_default_indexes:
+            assert all(name in final.xindexes for name in ["x", "y"])
+        else:
+            assert len(final.xindexes) == 0
+
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_default_indexes_passthrough(self, create_default_indexes):
+        """Allow creating indexes in the backend."""
+
+        initial = xr.Dataset(
+            {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])},
+            coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))},
+        ).stack(z=["x", "y"])
+
+        with assert_no_warnings():
+            final = xr.open_dataset(
+                initial,
+                engine=PassThroughBackendEntrypoint,
+                create_default_indexes=create_default_indexes,
+            )
+
+        assert initial.coords.equals(final.coords)
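
For reference, a minimal usage sketch of the new ``create_default_indexes`` parameter. This is illustrative only: the file name ``data.nc`` and the coordinate name ``x`` are hypothetical, and ``Dataset.set_xindex`` is the existing xarray API one would use to opt back in to an index for a specific coordinate afterwards.

    import xarray as xr

    # Default behaviour: pandas indexes are created for dimension
    # coordinates, which loads the coordinate data into memory.
    ds = xr.open_dataset("data.nc")
    assert "x" in ds.xindexes

    # Skip default index creation; coordinate data is not loaded eagerly.
    ds_lazy = xr.open_dataset("data.nc", create_default_indexes=False)
    assert len(ds_lazy.xindexes) == 0

    # Build an index for a single coordinate later, only when needed.
    ds_indexed = ds_lazy.set_xindex("x")  # creates a PandasIndex for "x"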
