diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e8d411ec927..b1c1a0828aa 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest env: ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment.yml + CONDA_ENV_FILE: ci/requirements/environment-benchmark.yml steps: # We need the full repo to avoid this issue @@ -29,7 +29,7 @@ jobs: with: micromamba-version: "1.5.10-0" environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests + environment-name: xarray-benchmark cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 20c873540de..b377542e402 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -60,7 +60,7 @@ // }, "matrix": { "setuptools_scm": [""], // GH6609 - "numpy": [""], + "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], "scipy": [""], diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py index aa4b6cb7df1..68a082fcc4f 100644 --- a/asv_bench/benchmarks/repr.py +++ b/asv_bench/benchmarks/repr.py @@ -57,3 +57,31 @@ def time_repr(self): def time_repr_html(self): self.da._repr_html_() + + +class ReprPandasRangeIndex: + # display a memory-saving pandas.RangeIndex shouldn't trigger memory + # expensive conversion into a numpy array + def setup(self): + index = xr.indexes.PandasIndex(pd.RangeIndex(1_000_000), "x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() + + +class ReprXarrayRangeIndex: + # display an Xarray RangeIndex shouldn't trigger memory expensive conversion + # of its lazy coordinate into a numpy array + def 
setup(self): + index = xr.indexes.RangeIndex.arange(1_000_000, dim="x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml new file mode 100644 index 00000000000..0e5c7f4b489 --- /dev/null +++ b/ci/requirements/environment-benchmark.yml @@ -0,0 +1,23 @@ +name: xarray-benchmark +channels: + - conda-forge + - nodefaults +dependencies: + - bottleneck + - cftime + - dask-core + - distributed + - flox + - netcdf4 + - numba + - numbagg + - numexpr + - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105 + - opt_einsum + - packaging + - pandas + - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - sparse + - scipy + - toolz + - zarr diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ad83cfac531..99ddb88e691 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,13 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactored the ``PandasIndexingAdapter`` and + ``CoordinateTransformIndexingAdapter`` internal indexing classes. Coordinate + variables that wrap a :py:class:`pandas.RangeIndex`, a + :py:class:`pandas.MultiIndex` or a + :py:class:`xarray.indexes.CoordinateTransform` are now displayed as lazy variables + in the Xarray data reprs (:pull:`10355`). + By `Benoit Bovy `_. .. 
_whats-new.2025.07.0: diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 7eb0841dc27..3a06cf18542 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -20,7 +20,11 @@ from xarray.core.datatree_render import RenderDataTree from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel from xarray.core.extension_array import PandasExtensionArray -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ( + BasicIndexer, + ExplicitlyIndexed, + MemoryCachedArray, +) from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.treenode import group_subtrees from xarray.core.utils import is_duck_array @@ -87,6 +91,8 @@ def first_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -111,6 +117,8 @@ def last_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -659,6 +667,7 @@ def short_array_repr(array): def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data + if isinstance(array, np.ndarray): return short_array_repr(array) elif is_duck_array(internal_data): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 35278efdeaf..8e4458fb88f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,6 @@ from contextlib import suppress from dataclasses import dataclass, field from datetime import timedelta -from html import escape from typing import TYPE_CHECKING, Any, cast, overload import numpy 
as np @@ -20,7 +19,6 @@ from xarray.core import duck_array_ops from xarray.core.coordinate_transform import CoordinateTransform from xarray.core.nputils import NumpyVIndexAdapter -from xarray.core.options import OPTIONS from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -1775,10 +1773,25 @@ def __init__( else: self._dtype = np.dtype(cast(DTypeLike, dtype)) + @property + def _in_memory(self) -> bool: + # prevent costly conversion of a memory-saving pd.RangeIndex into a + # large numpy array. + return not isinstance(self.array, pd.RangeIndex) + @property def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype + def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype: + if dtype is None: + if is_valid_numpy_dtype(self.dtype): + return cast(np.dtype, self.dtype) + else: + return get_valid_numpy_dtype(self.array) + else: + return np.dtype(dtype) + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -1786,11 +1799,9 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) array = self.array + if isinstance(array, pd.PeriodIndex): with suppress(AttributeError): # this might not be public API @@ -1834,98 +1845,61 @@ def _convert_scalar(self, item) -> np.ndarray: # numpy fails to convert pd.Timestamp to np.datetime64[ns] item = np.asarray(item.to_datetime64()) elif self.dtype != object: - dtype = self.dtype - if pd.api.types.is_extension_array_dtype(dtype): - dtype = get_valid_numpy_dtype(self.array) - item = np.asarray(item, dtype=cast(np.dtype, dtype)) + dtype = self._get_numpy_dtype() + item = np.asarray(item, dtype=dtype) # as for numpy.ndarray indexing, we always want the result to be # a NumPy array. 
return to_0d_array(item) - def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: - if isinstance(key, tuple) and len(key) == 1: + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) (key,) = key - return key + # if multidimensional key, convert the index to numpy array and index the latter + if getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas index then re-wrap or convert the result + result = self.array[key] - def _handle_result( - self, result: Any - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.oindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "_oindex_get") def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = 
NumpyIndexingAdapter(np.asarray(self)) - return indexable.vindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_vindex_get") def __getitem__( self, indexer: ExplicitIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable[indexer] - - result = self.array[key] - - return self._handle_result(result) + ) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "__getitem__") def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional + def _repr_inline_(self, max_width: int) -> str: + # we want to display values in the inline repr for lazy coordinates too + # (pd.RangeIndex and pd.MultiIndex). `format_array_flat` prevents loading + # the whole array in memory. 
+ from xarray.core.formatting import format_array_flat + + return format_array_flat(self, max_width) + def __repr__(self) -> str: return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})" @@ -1944,7 +1918,9 @@ def copy(self, deep: bool = True) -> Self: def nbytes(self) -> int: if pd.api.types.is_extension_array_dtype(self.dtype): return self.array.nbytes - return cast(np.dtype, self.dtype).itemsize * len(self.array) + + dtype = self._get_numpy_dtype() + return dtype.itemsize * len(self.array) class PandasMultiIndexingAdapter(PandasIndexingAdapter): @@ -1977,8 +1953,8 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None: - dtype = cast(np.dtype, self.dtype) + dtype = self._get_numpy_dtype(dtype) + if self.level is not None: return np.asarray( self.array.get_level_values(self.level).values, dtype=dtype @@ -1986,47 +1962,28 @@ def __array__( else: return super().__array__(dtype, copy=copy) - def _convert_scalar(self, item): + @property + def _in_memory(self) -> bool: + # The pd.MultiIndex's data is fully in memory, but it has a different + # layout than the level and dimension coordinate arrays. Marking this + # adapter class as a "lazy" array will prevent costly conversion when, + # e.g., formatting the Xarray reprs. 
+ return False + + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) item = item[idx] return super()._convert_scalar(item) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._oindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def _vindex_get( - self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._vindex_get(indexer) + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + result = super()._index_get(indexer, func_name) if isinstance(result, type(self)): result.level = self.level return result - def __getitem__(self, indexer: ExplicitIndexer): - result = super().__getitem__(indexer) - if isinstance(result, type(self)): - result.level = self.level - - return result - def __repr__(self) -> str: if self.level is None: return super().__repr__() @@ -2036,31 +1993,11 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _get_array_subset(self) -> np.ndarray: - # used to speed-up the repr for big multi-indexes - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)]) - subset = self[OuterIndexer((indices,))] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - from xarray.core.formatting import format_array_flat - if self.level is None: return "MultiIndex" else: - return format_array_flat(self._get_array_subset(), max_width) - - def _repr_html_(self) -> str: - from xarray.core.formatting import short_array_repr - - array_repr = 
short_array_repr(self._get_array_subset()) - return f"
<pre>{escape(array_repr)}</pre>
" + return super()._repr_inline_(max_width=max_width) def copy(self, deep: bool = True) -> Self: # see PandasIndexingAdapter.copy @@ -2097,6 +2034,10 @@ def dtype(self) -> np.dtype: def shape(self) -> tuple[int, ...]: return tuple(self._transform.dim_size.values()) + @property + def _in_memory(self) -> bool: + return False + def get_duck_array(self) -> np.ndarray: all_coords = self._transform.generate_coords(dims=self._dims) return np.asarray(all_coords[self._coord_name]) @@ -2157,23 +2098,9 @@ def transpose(self, order: Iterable[int]) -> Self: def __repr__(self: Any) -> str: return f"{type(self).__name__}(transform={self._transform!r})" - def _get_array_subset(self) -> np.ndarray: - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - flat_indices = np.concatenate( - [np.arange(0, pos), np.arange(self.size - pos, self.size)] - ) - subset = self.vindex[ - VectorizedIndexer(np.unravel_index(flat_indices, self.shape)) - ] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - """Good to see some labels even for a lazy coordinate.""" + # we want to display values in the inline repr for this lazy coordinate + # `format_array_flat` prevents loading the whole array in memory. 
from xarray.core.formatting import format_array_flat - return format_array_flat(self._get_array_subset(), max_width) + return format_array_flat(self, max_width) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 00d97e868c4..bcc2ca4e460 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -23,6 +23,7 @@ from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( BasicIndexer, + CoordinateTransformIndexingAdapter, OuterIndexer, PandasIndexingAdapter, VectorizedIndexer, @@ -403,10 +404,15 @@ def _new( return cls_(dims_, data, attrs_) @property - def _in_memory(self): + def _in_memory(self) -> bool: + if isinstance( + self._data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter + ): + return self._data._in_memory + return isinstance( self._data, - np.ndarray | np.number | PandasIndexingAdapter | PandasExtensionArray, + np.ndarray | np.number | PandasExtensionArray, ) or ( isinstance(self._data, indexing.MemoryCachedArray) and isinstance(self._data.array, indexing.NumpyIndexingAdapter) diff --git a/xarray/tests/test_coordinate_transform.py b/xarray/tests/test_coordinate_transform.py index 386ce426998..627063eb8cb 100644 --- a/xarray/tests/test_coordinate_transform.py +++ b/xarray/tests/test_coordinate_transform.py @@ -123,6 +123,17 @@ def test_coordinate_transform_variable_repr_inline() -> None: ) +def test_coordinate_transform_variable_repr() -> None: + var = create_coords(scale=2.0, shape=(2, 2))["x"].variable + + actual = repr(var) + expected = """ + Size: 32B +[4 values with dtype=float64] + """.strip() + assert actual == expected + + def test_coordinate_transform_variable_basic_outer_indexing() -> None: var = create_coords(scale=2.0, shape=(4, 4))["x"].variable diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 88c2c819405..c2ab1144e7b 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -1189,3 +1189,46 @@ def 
test_array_repr_dtypes(): Dimensions without coordinates: x """.strip() assert actual == expected + + +def test_repr_pandas_range_index() -> None: + # lazy data repr but values shown in inline repr + xidx = xr.indexes.PandasIndex(pd.RangeIndex(10), "x") + ds = xr.Dataset(coords=xr.Coordinates.from_xindex(xidx)) + actual = repr(ds.x) + expected = """ +<xarray.DataArray 'x' (x: 10)> Size: 80B +[10 values with dtype=int64] +Coordinates: + * x (x) int64 80B 0 1 2 3 4 5 6 7 8 9 + """.strip() + assert actual == expected + + +def test_repr_pandas_multi_index() -> None: + # lazy data repr but values shown in inline repr + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["foo", "bar"]) + coords = xr.Coordinates.from_pandas_multiindex(midx, "x") + ds = xr.Dataset(coords=coords) + + actual = repr(ds.x) + expected = """ +<xarray.DataArray 'x' (x: 4)> Size: 32B +[4 values with dtype=object] +Coordinates: + * x (x) object 32B MultiIndex + * foo (x) object 32B 'a' 'a' 'b' 'b' + * bar (x) int64 32B 1 2 1 2 + """.strip() + assert actual == expected + + actual = repr(ds.foo) + expected = """ +<xarray.DataArray 'foo' (x: 4)> Size: 32B +[4 values with dtype=object] +Coordinates: + * x (x) object 32B MultiIndex + * foo (x) object 32B 'a' 'a' 'b' 'b' + * bar (x) int64 32B 1 2 1 2 + """.strip() + assert actual == expected pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy