from __future__ import annotations import pickle import re import sys import warnings from collections.abc import Hashable from copy import copy, deepcopy from io import StringIO from textwrap import dedent from typing import Any, Literal import numpy as np import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex # remove once numpy 2.0 is the oldest supported version try: from numpy.exceptions import RankWarning except ImportError: from numpy import RankWarning # type: ignore[no-redef,attr-defined,unused-ignore] import xarray as xr from xarray import ( DataArray, Dataset, IndexVariable, MergeError, Variable, align, backends, broadcast, open_dataset, set_options, ) from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import Coordinates, DatasetCoordinates from xarray.core.indexes import Index, PandasIndex from xarray.core.types import ArrayLike from xarray.core.utils import is_scalar from xarray.groupers import TimeResampler from xarray.namedarray.pycompat import array_type, integer_types from xarray.testing import _assert_internal_invariants from xarray.tests import ( DuckArrayWrapper, InaccessibleArray, UnexpectedDataAccess, assert_allclose, assert_array_equal, assert_equal, assert_identical, assert_no_warnings, assert_writeable, create_test_data, has_cftime, has_dask, raise_if_dask_computes, requires_bottleneck, requires_cftime, requires_cupy, requires_dask, requires_numexpr, requires_pint, requires_scipy, requires_sparse, source_ndarray, ) try: from pandas.errors import UndefinedVariableError except ImportError: # TODO: remove once we stop supporting pandas<1.4.3 from pandas.core.computation.ops import UndefinedVariableError try: import dask.array as da except ImportError: pass # from numpy version 2.0 trapz is deprecated and renamed to trapezoid # remove once numpy 2.0 is the oldest supported version try: from numpy import trapezoid # type: ignore[attr-defined,unused-ignore] except ImportError: from numpy import ( # type: ignore[arg-type,no-redef,attr-defined,unused-ignore] trapz as trapezoid, ) sparse_array_type = array_type("sparse") pytestmark = [ pytest.mark.filterwarnings("error:Mean of empty slice"), pytest.mark.filterwarnings("error:All-NaN (slice|axis) encountered"), ] def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: rs = np.random.default_rng(seed) lat = [2, 1, 0] lon = [0, 1, 2] nt1 = 3 nt2 = 2 time1 = pd.date_range("2000-01-01", periods=nt1).as_unit("ns") time2 = pd.date_range("2000-02-01", periods=nt2).as_unit("ns") string_var = np.array(["a", "bc", "def"], dtype=object) string_var_to_append = np.array(["asdf", "asdfg"], dtype=object) string_var_fixed_length = np.array(["aa", "bb", "cc"], dtype="|S2") string_var_fixed_length_to_append = np.array(["dd", "ee"], dtype="|S2") unicode_var = np.array(["áó", "áó", "áó"]) datetime_var = np.array( ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[ns]" ) datetime_var_to_append = np.array( ["2019-01-04", "2019-01-05"], dtype="datetime64[ns]" ) bool_var = np.array([True, False, True], dtype=bool) bool_var_to_append = np.array([False, True], dtype=bool) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Converting non-default") ds = xr.Dataset( data_vars={ "da": xr.DataArray( rs.random((3, 3, nt1)), coords=[lat, lon, time1], dims=["lat", "lon", "time"], ), "string_var": ("time", string_var), "string_var_fixed_length": ("time", string_var_fixed_length), "unicode_var": ("time", unicode_var), "datetime_var": ("time", datetime_var), "bool_var": ("time", bool_var), } ) ds_to_append = xr.Dataset( data_vars={ "da": xr.DataArray( rs.random((3, 3, nt2)), coords=[lat, lon, time2], dims=["lat", "lon", "time"], ), "string_var": ("time", string_var_to_append), "string_var_fixed_length": ("time", string_var_fixed_length_to_append), "unicode_var": ("time", unicode_var[:nt2]), "datetime_var": ("time", datetime_var_to_append), "bool_var": ("time", bool_var_to_append), } ) ds_with_new_var = xr.Dataset( data_vars={ "new_var": xr.DataArray( rs.random((3, 3, nt1 + nt2)), coords=[lat, lon, time1.append(time2)], dims=["lat", "lon", "time"], ) } ) assert_writeable(ds) assert_writeable(ds_to_append) assert_writeable(ds_with_new_var) return ds, ds_to_append, ds_with_new_var def create_append_string_length_mismatch_test_data(dtype) -> tuple[Dataset, Dataset]: def make_datasets(data, data_to_append) -> tuple[Dataset, Dataset]: ds = xr.Dataset( {"temperature": (["time"], data)}, coords={"time": [0, 1, 2]}, ) ds_to_append = xr.Dataset( {"temperature": (["time"], data_to_append)}, coords={"time": [0, 1, 2]} ) assert_writeable(ds) assert_writeable(ds_to_append) return ds, ds_to_append u2_strings = ["ab", "cd", "ef"] u5_strings = ["abc", "def", "ghijk"] s2_strings = np.array(["aa", "bb", "cc"], dtype="|S2") s3_strings = np.array(["aaa", "bbb", "ccc"], dtype="|S3") if dtype == "U": return make_datasets(u2_strings, u5_strings) elif dtype == "S": return make_datasets(s2_strings, s3_strings) else: raise ValueError(f"unsupported dtype {dtype}.") def create_test_multiindex() -> Dataset: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) return Dataset({}, Coordinates.from_pandas_multiindex(mindex, "x")) def create_test_stacked_array() -> tuple[DataArray, DataArray]: x = DataArray(pd.Index(np.r_[:10], name="x")) y = DataArray(pd.Index(np.r_[:20], name="y")) a = x * y b = x * y * y return a, b class InaccessibleVariableDataStore(backends.InMemoryDataStore): """ Store that does not allow any data access. """ def __init__(self): super().__init__() self._indexvars = set() def store(self, variables, *args, **kwargs) -> None: super().store(variables, *args, **kwargs) for k, v in variables.items(): if isinstance(v, IndexVariable): self._indexvars.add(k) def get_variables(self): def lazy_inaccessible(k, v): if k in self._indexvars: return v data = indexing.LazilyIndexedArray(InaccessibleArray(v.values)) return Variable(v.dims, data, v.attrs) return {k: lazy_inaccessible(k, v) for k, v in self._variables.items()} class DuckBackendArrayWrapper(backends.common.BackendArray): """Mimic a BackendArray wrapper around DuckArrayWrapper""" def __init__(self, array): self.array = DuckArrayWrapper(array) self.shape = array.shape self.dtype = array.dtype def get_array(self): return self.array def __getitem__(self, key): return self.array[key.tuple] class AccessibleAsDuckArrayDataStore(backends.InMemoryDataStore): """ Store that returns a duck array, not convertible to numpy array, on read. Modeled after nVIDIA's kvikio. """ def __init__(self): super().__init__() self._indexvars = set() def store(self, variables, *args, **kwargs) -> None: super().store(variables, *args, **kwargs) for k, v in variables.items(): if isinstance(v, IndexVariable): self._indexvars.add(k) def get_variables(self) -> dict[Any, xr.Variable]: def lazy_accessible(k, v) -> xr.Variable: if k in self._indexvars: return v data = indexing.LazilyIndexedArray(DuckBackendArrayWrapper(v.values)) return Variable(v.dims, data, v.attrs) return {k: lazy_accessible(k, v) for k, v in self._variables.items()} class TestDataset: def test_repr(self) -> None: data = create_test_data(seed=123) data.attrs["foo"] = "bar" # need to insert str dtype at runtime to handle different endianness expected = dedent( """\ Size: 2kB Dimensions: (dim2: 9, dim3: 10, time: 20, dim1: 8) Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' * time (time) datetime64[{}] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: var1 (dim1, dim2) float64 576B -0.9891 -0.3678 1.288 ... -0.2116 0.364 var2 (dim1, dim2) float64 576B 0.953 1.52 1.704 ... 0.1347 -0.6423 var3 (dim3, dim1) float64 640B 0.4107 0.9941 0.1665 ... 0.716 1.555 Attributes: foo: bar""".format( data["dim3"].dtype, "ns", ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) assert expected == actual with set_options(display_width=100): max_len = max(map(len, repr(data).split("\n"))) assert 90 < max_len < 100 expected = dedent( """\ Size: 0B Dimensions: () Data variables: *empty*""" ) actual = "\n".join(x.rstrip() for x in repr(Dataset()).split("\n")) print(actual) assert expected == actual # verify that ... doesn't appear for scalar coordinates data = Dataset({"foo": ("x", np.ones(10))}).mean() expected = dedent( """\ Size: 8B Dimensions: () Data variables: foo float64 8B 1.0""" ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) assert expected == actual # verify long attributes are truncated data = Dataset(attrs={"foo": "bar" * 1000}) assert len(repr(data)) < 1000 def test_repr_multiindex(self) -> None: data = create_test_multiindex() obj_size = np.dtype("O").itemsize expected = dedent( f"""\ Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: * x (x) object {4 * obj_size}B MultiIndex * level_1 (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) assert expected == actual # verify that long level names are not truncated midx = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("a_quite_long_level_name", "level_2") ) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") data = Dataset({}, midx_coords) expected = dedent( f"""\ Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: * x (x) object {4 * obj_size}B MultiIndex * a_quite_long_level_name (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) assert expected == actual def test_repr_period_index(self) -> None: data = create_test_data(seed=456) data.coords["time"] = pd.period_range("2000-01-01", periods=20, freq="D") # check that creating the repr doesn't raise an error #GH645 repr(data) def test_unicode_data(self) -> None: # regression test for GH834 data = Dataset({"foø": ["ba®"]}, attrs={"å": "∑"}) repr(data) # should not raise byteorder = "<" if sys.byteorder == "little" else ">" expected = dedent( f"""\ Size: 12B Dimensions: (foø: 1) Coordinates: * foø (foø) {byteorder}U3 12B {'ba®'!r} Data variables: *empty* Attributes: å: ∑""" ) actual = str(data) assert expected == actual def test_repr_nep18(self) -> None: class Array: def __init__(self): self.shape = (2,) self.ndim = 1 self.dtype = np.dtype(np.float64) def __array_function__(self, *args, **kwargs): return NotImplemented def __array_ufunc__(self, *args, **kwargs): return NotImplemented def __repr__(self): return "Custom\nArray" dataset = Dataset({"foo": ("x", Array())}) expected = dedent( """\ Size: 16B Dimensions: (x: 2) Dimensions without coordinates: x Data variables: foo (x) float64 16B Custom Array""" ) assert expected == repr(dataset) def test_info(self) -> None: ds = create_test_data(seed=123) ds = ds.drop_vars("dim3") # string type prints differently in PY2 vs PY3 ds.attrs["unicode_attr"] = "ba®" ds.attrs["string_attr"] = "bar" buf = StringIO() ds.info(buf=buf) expected = dedent( """\ xarray.Dataset { dimensions: \tdim2 = 9 ; \ttime = 20 ; \tdim1 = 8 ; \tdim3 = 10 ; variables: \tfloat64 dim2(dim2) ; \tdatetime64[ns] time(time) ; \tfloat64 var1(dim1, dim2) ; \t\tvar1:foo = variable ; \tfloat64 var2(dim1, dim2) ; \t\tvar2:foo = variable ; \tfloat64 var3(dim3, dim1) ; \t\tvar3:foo = variable ; \tint64 numbers(dim3) ; // global attributes: \t:unicode_attr = ba® ; \t:string_attr = bar ; }""" ) actual = buf.getvalue() assert expected == actual buf.close() def test_constructor(self) -> None: x1 = ("x", 2 * np.arange(100)) x2 = ("x", np.arange(1000)) z = (["x", "y"], np.arange(1000).reshape(100, 10)) with pytest.raises(ValueError, match=r"conflicting sizes"): Dataset({"a": x1, "b": x2}) with pytest.raises(TypeError, match=r"tuple of form"): Dataset({"x": (1, 2, 3, 4, 5, 6, 7)}) with pytest.raises(ValueError, match=r"already exists as a scalar"): Dataset({"x": 0, "y": ("x", [1, 2, 3])}) # nD coordinate variable "x" sharing name with dimension actual = Dataset({"a": x1, "x": z}) assert "x" not in actual.xindexes _assert_internal_invariants(actual, check_default_indexes=True) # verify handling of DataArrays expected = Dataset({"x": x1, "z": z}) actual = Dataset({"z": expected["z"]}) assert_identical(expected, actual) def test_constructor_1d(self) -> None: expected = Dataset({"x": (["x"], 5.0 + np.arange(5))}) actual = Dataset({"x": 5.0 + np.arange(5)}) assert_identical(expected, actual) actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: actual = Dataset({"x": arg}) assert_identical(expected, actual) class Arbitrary: pass d = pd.Timestamp("2000-01-01T12") args = [ True, None, 3.4, np.nan, "hello", b"raw", np.datetime64("2000-01-01"), d, d.to_pydatetime(), Arbitrary(), ] for arg in args: print(arg) expected = Dataset({"x": ([], arg)}) actual = Dataset({"x": arg}) assert_identical(expected, actual) def test_constructor_auto_align(self) -> None: a = DataArray([1, 2], [("x", [0, 1])]) b = DataArray([3, 4], [("x", [1, 2])]) # verify align uses outer join expected = Dataset( {"a": ("x", [1, 2, np.nan]), "b": ("x", [np.nan, 3, 4])}, {"x": [0, 1, 2]} ) actual = Dataset({"a": a, "b": b}) assert_identical(expected, actual) # regression test for GH346 assert isinstance(actual.variables["x"], IndexVariable) # variable with different dimensions c = ("y", [3, 4]) expected2 = expected.merge({"c": c}) actual = Dataset({"a": a, "b": b, "c": c}) assert_identical(expected2, actual) # variable that is only aligned against the aligned variables d = ("x", [3, 2, 1]) expected3 = expected.merge({"d": d}) actual = Dataset({"a": a, "b": b, "d": d}) assert_identical(expected3, actual) e = ("x", [0, 0]) with pytest.raises(ValueError, match=r"conflicting sizes"): Dataset({"a": a, "b": b, "e": e}) def test_constructor_pandas_sequence(self) -> None: ds = self.make_example_math_dataset() pandas_objs = { var_name: ds[var_name].to_pandas() for var_name in ["foo", "bar"] } ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) del ds_based_on_pandas["x"] assert_equal(ds, ds_based_on_pandas) # reindex pandas obj, check align works rearranged_index = reversed(pandas_objs["foo"].index) pandas_objs["foo"] = pandas_objs["foo"].reindex(rearranged_index) ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) del ds_based_on_pandas["x"] assert_equal(ds, ds_based_on_pandas) def test_constructor_pandas_single(self) -> None: das = [ DataArray(np.random.rand(4), dims=["a"]), # series DataArray(np.random.rand(4, 3), dims=["a", "b"]), # df ] for a in das: pandas_obj = a.to_pandas() ds_based_on_pandas = Dataset(pandas_obj) # type: ignore[arg-type] # TODO: improve typing of __init__ for dim in ds_based_on_pandas.data_vars: assert isinstance(dim, int) assert_array_equal(ds_based_on_pandas[dim], pandas_obj[dim]) def test_constructor_compat(self) -> None: data = {"x": DataArray(0, coords={"y": 1}), "y": ("z", [1, 1, 1])} expected = Dataset({"x": 0}, {"y": ("z", [1, 1, 1])}) actual = Dataset(data) assert_identical(expected, actual) data = {"y": ("z", [1, 1, 1]), "x": DataArray(0, coords={"y": 1})} actual = Dataset(data) assert_identical(expected, actual) original = Dataset( {"a": (("x", "y"), np.ones((2, 3)))}, {"c": (("x", "y"), np.zeros((2, 3))), "x": [0, 1]}, ) expected = Dataset( {"a": ("x", np.ones(2)), "b": ("y", np.ones(3))}, {"c": (("x", "y"), np.zeros((2, 3))), "x": [0, 1]}, ) actual = Dataset( {"a": original["a"][:, 0], "b": original["a"][0].drop_vars("x")} ) assert_identical(expected, actual) data = {"x": DataArray(0, coords={"y": 3}), "y": ("z", [1, 1, 1])} with pytest.raises(MergeError): Dataset(data) data = {"x": DataArray(0, coords={"y": 1}), "y": [1, 1]} actual = Dataset(data) expected = Dataset({"x": 0}, {"y": [1, 1]}) assert_identical(expected, actual) def test_constructor_with_coords(self) -> None: with pytest.raises(ValueError, match=r"found in both data_vars and"): Dataset({"a": ("x", [1])}, {"a": ("x", [1])}) ds = Dataset({}, {"a": ("x", [1])}) assert not ds.data_vars assert list(ds.coords.keys()) == ["a"] mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) with pytest.raises(ValueError, match=r"conflicting MultiIndex"): with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset({}, {"x": mindex, "y": mindex}) Dataset({}, {"x": mindex, "level_1": range(4)}) def test_constructor_no_default_index(self) -> None: # explicitly passing a Coordinates object skips the creation of default index ds = Dataset(coords=Coordinates({"x": [1, 2, 3]}, indexes={})) assert "x" in ds assert "x" not in ds.xindexes def test_constructor_multiindex(self) -> None: midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) coords = Coordinates.from_pandas_multiindex(midx, "x") ds = Dataset(coords=coords) assert_identical(ds, coords.to_dataset()) with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset(data_vars={"x": midx}) with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset(coords={"x": midx}) def test_constructor_custom_index(self) -> None: class CustomIndex(Index): ... coords = Coordinates( coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} ) ds = Dataset(coords=coords) assert isinstance(ds.xindexes["x"], CustomIndex) # test coordinate variables copied assert ds.variables["x"] is not coords.variables["x"] @pytest.mark.filterwarnings("ignore:return type") def test_properties(self) -> None: ds = create_test_data() # dims / sizes # These exact types aren't public API, but this makes sure we don't # change them inadvertently: assert isinstance(ds.dims, utils.Frozen) # TODO change after deprecation cycle in GH #8500 is complete assert isinstance(ds.dims.mapping, dict) assert type(ds.dims.mapping) is dict with pytest.warns( FutureWarning, match=" To access a mapping from dimension names to lengths, please use `Dataset.sizes`", ): assert ds.dims == ds.sizes assert ds.sizes == {"dim1": 8, "dim2": 9, "dim3": 10, "time": 20} # dtypes assert isinstance(ds.dtypes, utils.Frozen) assert isinstance(ds.dtypes.mapping, dict) assert ds.dtypes == { "var1": np.dtype("float64"), "var2": np.dtype("float64"), "var3": np.dtype("float64"), } # data_vars assert list(ds) == list(ds.data_vars) assert list(ds.keys()) == list(ds.data_vars) assert "aasldfjalskdfj" not in ds.variables assert "dim1" in repr(ds.variables) assert len(ds) == 3 assert bool(ds) assert list(ds.data_vars) == ["var1", "var2", "var3"] assert list(ds.data_vars.keys()) == ["var1", "var2", "var3"] assert "var1" in ds.data_vars assert "dim1" not in ds.data_vars assert "numbers" not in ds.data_vars assert len(ds.data_vars) == 3 # xindexes assert set(ds.xindexes) == {"dim2", "dim3", "time"} assert len(ds.xindexes) == 3 assert "dim2" in repr(ds.xindexes) assert all(isinstance(idx, Index) for idx in ds.xindexes.values()) # indexes assert set(ds.indexes) == {"dim2", "dim3", "time"} assert len(ds.indexes) == 3 assert "dim2" in repr(ds.indexes) assert all(isinstance(idx, pd.Index) for idx in ds.indexes.values()) # coords assert list(ds.coords) == ["dim2", "dim3", "time", "numbers"] assert "dim2" in ds.coords assert "numbers" in ds.coords assert "var1" not in ds.coords assert "dim1" not in ds.coords assert len(ds.coords) == 4 # nbytes assert ( Dataset({"x": np.int64(1), "y": np.array([1, 2], dtype=np.float32)}).nbytes == 16 ) def test_warn_ds_dims_deprecation(self) -> None: # TODO remove after deprecation cycle in GH #8500 is complete ds = create_test_data() with pytest.warns(FutureWarning, match="return type"): ds.dims["dim1"] with pytest.warns(FutureWarning, match="return type"): ds.dims.keys() with pytest.warns(FutureWarning, match="return type"): ds.dims.values() with pytest.warns(FutureWarning, match="return type"): ds.dims.items() with assert_no_warnings(): len(ds.dims) ds.dims.__iter__() _ = "dim1" in ds.dims def test_asarray(self) -> None: ds = Dataset({"x": 0}) with pytest.raises(TypeError, match=r"cannot directly convert"): np.asarray(ds) def test_get_index(self) -> None: ds = Dataset({"foo": (("x", "y"), np.zeros((2, 3)))}, coords={"x": ["a", "b"]}) assert ds.get_index("x").equals(pd.Index(["a", "b"])) assert ds.get_index("y").equals(pd.Index([0, 1, 2])) with pytest.raises(KeyError): ds.get_index("z") def test_attr_access(self) -> None: ds = Dataset( {"tmin": ("x", [42], {"units": "Celsius"})}, attrs={"title": "My test data"} ) assert_identical(ds.tmin, ds["tmin"]) assert_identical(ds.tmin.x, ds.x) assert ds.title == ds.attrs["title"] assert ds.tmin.units == ds["tmin"].attrs["units"] assert {"tmin", "title"} <= set(dir(ds)) assert "units" in set(dir(ds.tmin)) # should defer to variable of same name ds.attrs["tmin"] = -999 assert ds.attrs["tmin"] == -999 assert_identical(ds.tmin, ds["tmin"]) def test_variable(self) -> None: a = Dataset() d = np.random.random((10, 3)) a["foo"] = (("time", "x"), d) assert "foo" in a.variables assert "foo" in a a["bar"] = (("time", "x"), d) # order of creation is preserved assert list(a.variables) == ["foo", "bar"] assert_array_equal(a["foo"].values, d) # try to add variable with dim (10,3) with data that's (3,10) with pytest.raises(ValueError): a["qux"] = (("time", "x"), d.T) def test_modify_inplace(self) -> None: a = Dataset() vec = np.random.random((10,)) attributes = {"foo": "bar"} a["x"] = ("x", vec, attributes) assert "x" in a.coords assert isinstance(a.coords["x"].to_index(), pd.Index) assert_identical(a.coords["x"].variable, a.variables["x"]) b = Dataset() b["x"] = ("x", vec, attributes) assert_identical(a["x"], b["x"]) assert a.sizes == b.sizes # this should work a["x"] = ("x", vec[:5]) a["z"] = ("x", np.arange(5)) with pytest.raises(ValueError): # now it shouldn't, since there is a conflicting length a["x"] = ("x", vec[:4]) arr = np.random.random((10, 1)) scal = np.array(0) with pytest.raises(ValueError): a["y"] = ("y", arr) with pytest.raises(ValueError): a["y"] = ("y", scal) assert "y" not in a.dims def test_coords_properties(self) -> None: # use int64 for repr consistency on windows data = Dataset( { "x": ("x", np.array([-1, -2], "int64")), "y": ("y", np.array([0, 1, 2], "int64")), "foo": (["x", "y"], np.random.randn(2, 3)), }, {"a": ("x", np.array([4, 5], "int64")), "b": np.int64(-10)}, ) coords = data.coords assert isinstance(coords, DatasetCoordinates) # len assert len(coords) == 4 # iter assert list(coords) == ["x", "y", "a", "b"] assert_identical(coords["x"].variable, data["x"].variable) assert_identical(coords["y"].variable, data["y"].variable) assert "x" in coords assert "a" in coords assert 0 not in coords assert "foo" not in coords with pytest.raises(KeyError): coords["foo"] with pytest.raises(KeyError): coords[0] # repr expected = dedent( """\ Coordinates: * x (x) int64 16B -1 -2 * y (y) int64 24B 0 1 2 a (x) int64 16B 4 5 b int64 8B -10""" ) actual = repr(coords) assert expected == actual # dims assert coords.sizes == {"x": 2, "y": 3} # dtypes assert coords.dtypes == { "x": np.dtype("int64"), "y": np.dtype("int64"), "a": np.dtype("int64"), "b": np.dtype("int64"), } def test_coords_modify(self) -> None: data = Dataset( { "x": ("x", [-1, -2]), "y": ("y", [0, 1, 2]), "foo": (["x", "y"], np.random.randn(2, 3)), }, {"a": ("x", [4, 5]), "b": -10}, ) actual = data.copy(deep=True) actual.coords["x"] = ("x", ["a", "b"]) assert_array_equal(actual["x"], ["a", "b"]) actual = data.copy(deep=True) actual.coords["z"] = ("z", ["a", "b"]) assert_array_equal(actual["z"], ["a", "b"]) actual = data.copy(deep=True) with pytest.raises(ValueError, match=r"conflicting dimension sizes"): actual.coords["x"] = ("x", [-1]) assert_identical(actual, data) # should not be modified actual = data.copy() del actual.coords["b"] expected = data.reset_coords("b", drop=True) assert_identical(expected, actual) with pytest.raises(KeyError): del data.coords["not_found"] with pytest.raises(KeyError): del data.coords["foo"] actual = data.copy(deep=True) actual.coords.update({"c": 11}) expected = data.merge({"c": 11}).set_coords("c") assert_identical(expected, actual) # regression test for GH3746 del actual.coords["x"] assert "x" not in actual.xindexes def test_update_index(self) -> None: actual = Dataset(coords={"x": [1, 2, 3]}) actual["x"] = ["a", "b", "c"] assert actual.xindexes["x"].to_pandas_index().equals(pd.Index(["a", "b", "c"])) def test_coords_setitem_with_new_dimension(self) -> None: actual = Dataset() actual.coords["foo"] = ("x", [1, 2, 3]) expected = Dataset(coords={"foo": ("x", [1, 2, 3])}) assert_identical(expected, actual) def test_coords_setitem_multiindex(self) -> None: data = create_test_multiindex() with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.coords["level_1"] = range(4) def test_coords_set(self) -> None: one_coord = Dataset({"x": ("x", [0]), "yy": ("x", [1]), "zzz": ("x", [2])}) two_coords = Dataset({"zzz": ("x", [2])}, {"x": ("x", [0]), "yy": ("x", [1])}) all_coords = Dataset( coords={"x": ("x", [0]), "yy": ("x", [1]), "zzz": ("x", [2])} ) actual = one_coord.set_coords("x") assert_identical(one_coord, actual) actual = one_coord.set_coords(["x"]) assert_identical(one_coord, actual) actual = one_coord.set_coords("yy") assert_identical(two_coords, actual) actual = one_coord.set_coords(["yy", "zzz"]) assert_identical(all_coords, actual) actual = one_coord.reset_coords() assert_identical(one_coord, actual) actual = two_coords.reset_coords() assert_identical(one_coord, actual) actual = all_coords.reset_coords() assert_identical(one_coord, actual) actual = all_coords.reset_coords(["yy", "zzz"]) assert_identical(one_coord, actual) actual = all_coords.reset_coords("zzz") assert_identical(two_coords, actual) with pytest.raises(ValueError, match=r"cannot remove index"): one_coord.reset_coords("x") actual = all_coords.reset_coords("zzz", drop=True) expected = all_coords.drop_vars("zzz") assert_identical(expected, actual) expected = two_coords.drop_vars("zzz") assert_identical(expected, actual) def test_coords_to_dataset(self) -> None: orig = Dataset({"foo": ("y", [-1, 0, 1])}, {"x": 10, "y": [2, 3, 4]}) expected = Dataset(coords={"x": 10, "y": [2, 3, 4]}) actual = orig.coords.to_dataset() assert_identical(expected, actual) def test_coords_merge(self) -> None: orig_coords = Dataset(coords={"a": ("x", [1, 2]), "x": [0, 1]}).coords other_coords = Dataset(coords={"b": ("x", ["a", "b"]), "x": [0, 1]}).coords expected = Dataset( coords={"a": ("x", [1, 2]), "b": ("x", ["a", "b"]), "x": [0, 1]} ) actual = orig_coords.merge(other_coords) assert_identical(expected, actual) actual = other_coords.merge(orig_coords) assert_identical(expected, actual) other_coords = Dataset(coords={"x": ("x", ["a"])}).coords with pytest.raises(MergeError): orig_coords.merge(other_coords) other_coords = Dataset(coords={"x": ("x", ["a", "b"])}).coords with pytest.raises(MergeError): orig_coords.merge(other_coords) other_coords = Dataset(coords={"x": ("x", ["a", "b", "c"])}).coords with pytest.raises(MergeError): orig_coords.merge(other_coords) other_coords = Dataset(coords={"a": ("x", [8, 9])}).coords expected = Dataset(coords={"x": range(2)}) actual = orig_coords.merge(other_coords) assert_identical(expected, actual) actual = other_coords.merge(orig_coords) assert_identical(expected, actual) other_coords = Dataset(coords={"x": np.nan}).coords actual = orig_coords.merge(other_coords) assert_identical(orig_coords.to_dataset(), actual) actual = other_coords.merge(orig_coords) assert_identical(orig_coords.to_dataset(), actual) def test_coords_merge_mismatched_shape(self) -> None: orig_coords = Dataset(coords={"a": ("x", [1, 1])}).coords other_coords = Dataset(coords={"a": 1}).coords expected = orig_coords.to_dataset() actual = orig_coords.merge(other_coords) assert_identical(expected, actual) other_coords = Dataset(coords={"a": ("y", [1])}).coords expected = Dataset(coords={"a": (["x", "y"], [[1], [1]])}) actual = orig_coords.merge(other_coords) assert_identical(expected, actual) actual = other_coords.merge(orig_coords) assert_identical(expected.transpose(), actual) orig_coords = Dataset(coords={"a": ("x", [np.nan])}).coords other_coords = Dataset(coords={"a": np.nan}).coords expected = orig_coords.to_dataset() actual = orig_coords.merge(other_coords) assert_identical(expected, actual) def test_data_vars_properties(self) -> None: ds = Dataset() ds["foo"] = (("x",), [1.0]) ds["bar"] = 2.0 # iter assert set(ds.data_vars) == {"foo", "bar"} assert "foo" in ds.data_vars assert "x" not in ds.data_vars assert_identical(ds["foo"], ds.data_vars["foo"]) # repr expected = dedent( """\ Data variables: foo (x) float64 8B 1.0 bar float64 8B 2.0""" ) actual = repr(ds.data_vars) assert expected == actual # dtypes assert ds.data_vars.dtypes == { "foo": np.dtype("float64"), "bar": np.dtype("float64"), } # len ds.coords["x"] = [1] assert len(ds.data_vars) == 2 # https://github.com/pydata/xarray/issues/7588 with pytest.raises( AssertionError, match="something is wrong with Dataset._coord_names" ): ds._coord_names = {"w", "x", "y", "z"} len(ds.data_vars) def test_equals_and_identical(self) -> None: data = create_test_data(seed=42) assert data.equals(data) assert data.identical(data) data2 = create_test_data(seed=42) data2.attrs["foobar"] = "baz" assert data.equals(data2) assert not data.identical(data2) del data2["time"] assert not data.equals(data2) data = create_test_data(seed=42).rename({"var1": None}) assert data.equals(data) assert data.identical(data) data2 = data.reset_coords() assert not data2.equals(data) assert not data2.identical(data) def test_equals_failures(self) -> None: data = create_test_data() assert not data.equals("foo") # type: ignore[arg-type] assert not data.identical(123) # type: ignore[arg-type] assert not data.broadcast_equals({1: 2}) # type: ignore[arg-type] def test_broadcast_equals(self) -> None: data1 = Dataset(coords={"x": 0}) data2 = Dataset(coords={"x": [0]}) assert data1.broadcast_equals(data2) assert not data1.equals(data2) assert not data1.identical(data2) def test_attrs(self) -> None: data = create_test_data(seed=42) data.attrs = {"foobar": "baz"} assert data.attrs["foobar"], "baz" assert isinstance(data.attrs, dict) def test_chunks_does_not_load_data(self) -> None: # regression test for GH6538 store = InaccessibleVariableDataStore() create_test_data().dump_to_store(store) ds = open_dataset(store) assert ds.chunks == {} @requires_dask def test_chunk(self) -> None: data = create_test_data() for v in data.variables.values(): assert isinstance(v.data, np.ndarray) assert data.chunks == {} reblocked = data.chunk() for k, v in reblocked.variables.items(): if k in reblocked.dims: assert isinstance(v.data, np.ndarray) else: assert isinstance(v.data, da.Array) expected_chunks: dict[Hashable, tuple[int, ...]] = { "dim1": (8,), "dim2": (9,), "dim3": (10,), } assert reblocked.chunks == expected_chunks # test kwargs form of chunks assert data.chunk(expected_chunks).chunks == expected_chunks def get_dask_names(ds): return {k: v.data.name for k, v in ds.items()} orig_dask_names = get_dask_names(reblocked) reblocked = data.chunk({"time": 5, "dim1": 5, "dim2": 5, "dim3": 5}) # time is not a dim in any of the data_vars, so it # doesn't get chunked expected_chunks = {"dim1": (5, 3), "dim2": (5, 4), "dim3": (5, 5)} assert reblocked.chunks == expected_chunks # make sure dask names change when rechunking by different amounts # regression test for GH3350 new_dask_names = get_dask_names(reblocked) for k, v in new_dask_names.items(): assert v != orig_dask_names[k] reblocked = data.chunk(expected_chunks) assert reblocked.chunks == expected_chunks # reblock on already blocked data orig_dask_names = get_dask_names(reblocked) reblocked = reblocked.chunk(expected_chunks) new_dask_names = get_dask_names(reblocked) assert reblocked.chunks == expected_chunks assert_identical(reblocked, data) # rechunking with same chunk sizes should not change names for k, v in new_dask_names.items(): assert v == orig_dask_names[k] with pytest.raises( ValueError, match=re.escape( "chunks keys ('foo',) not found in data dimensions ('dim2', 'dim3', 'time', 'dim1')" ), ): data.chunk({"foo": 10}) @requires_dask @pytest.mark.parametrize( "calendar", ( "standard", pytest.param( "gregorian", marks=pytest.mark.skipif(not has_cftime, reason="needs cftime"), ), ), ) @pytest.mark.parametrize("freq", ["D", "W", "5ME", "YE"]) @pytest.mark.parametrize("add_gap", [True, False]) def test_chunk_by_frequency(self, freq: str, calendar: str, add_gap: bool) -> None: import dask.array N = 365 * 2 ΔN = 28 time = xr.date_range( "2001-01-01", periods=N + ΔN, freq="D", calendar=calendar ).to_numpy(copy=True) if add_gap: # introduce an empty bin time[31 : 31 + ΔN] = np.datetime64("NaT") time = time[~np.isnat(time)] else: time = time[:N] ds = Dataset( { "pr": ("time", dask.array.random.random((N), chunks=(20))), "pr2d": (("x", "time"), dask.array.random.random((10, N), chunks=(20))), "ones": ("time", np.ones((N,))), }, coords={"time": time}, ) rechunked = ds.chunk(x=2, time=TimeResampler(freq)) expected = tuple( ds.ones.resample(time=freq).sum().dropna("time").astype(int).data.tolist() ) assert rechunked.chunksizes["time"] == expected assert rechunked.chunksizes["x"] == (2,) * 5 rechunked = ds.chunk({"x": 2, "time": TimeResampler(freq)}) assert rechunked.chunksizes["time"] == expected assert rechunked.chunksizes["x"] == (2,) * 5 def test_chunk_by_frequecy_errors(self): ds = Dataset({"foo": ("x", [1, 2, 3])}) with pytest.raises(ValueError, match="virtual variable"): ds.chunk(x=TimeResampler("YE")) ds["x"] = ("x", [1, 2, 3]) with pytest.raises(ValueError, match="datetime variables"): ds.chunk(x=TimeResampler("YE")) ds["x"] = ("x", xr.date_range("2001-01-01", periods=3, freq="D")) with pytest.raises(ValueError, match="Invalid frequency"): ds.chunk(x=TimeResampler("foo")) @requires_dask def test_dask_is_lazy(self) -> None: store = InaccessibleVariableDataStore() create_test_data().dump_to_store(store) ds = open_dataset(store).chunk() with pytest.raises(UnexpectedDataAccess): ds.load() with pytest.raises(UnexpectedDataAccess): _ = ds["var1"].values # these should not raise UnexpectedDataAccess: _ = ds.var1.data ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) ds.transpose() ds.mean() ds.fillna(0) ds.rename({"dim1": "foobar"}) ds.set_coords("var1") ds.drop_vars("var1") def test_isel(self) -> None: data = create_test_data() slicers: dict[Hashable, slice] = { "dim1": slice(None, None, 2), "dim2": slice(0, 2), } ret = data.isel(slicers) # Verify that only the specified dimension was altered assert list(data.dims) == list(ret.dims) for d in data.dims: if d in slicers: assert ret.sizes[d] == np.arange(data.sizes[d])[slicers[d]].size else: assert data.sizes[d] == ret.sizes[d] # Verify that the data is what we expect for v in data.variables: assert data[v].dims == ret[v].dims assert data[v].attrs == ret[v].attrs slice_list = [slice(None)] * data[v].values.ndim for d, s in slicers.items(): if d in data[v].dims: inds = np.nonzero(np.array(data[v].dims) == d)[0] for ind in inds: slice_list[ind] = s expected = data[v].values[tuple(slice_list)] actual = ret[v].values np.testing.assert_array_equal(expected, actual) with pytest.raises(ValueError): data.isel(not_a_dim=slice(0, 2)) with pytest.raises( ValueError, match=r"Dimensions {'not_a_dim'} do not exist. Expected " r"one or more of " r"[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*", ): data.isel(not_a_dim=slice(0, 2)) with pytest.warns( UserWarning, match=r"Dimensions {'not_a_dim'} do not exist. " r"Expected one or more of " r"[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*", ): data.isel(not_a_dim=slice(0, 2), missing_dims="warn") assert_identical(data, data.isel(not_a_dim=slice(0, 2), missing_dims="ignore")) ret = data.isel(dim1=0) assert {"time": 20, "dim2": 9, "dim3": 10} == ret.sizes assert set(data.data_vars) == set(ret.data_vars) assert set(data.coords) == set(ret.coords) assert set(data.xindexes) == set(ret.xindexes) ret = data.isel(time=slice(2), dim1=0, dim2=slice(5)) assert {"time": 2, "dim2": 5, "dim3": 10} == ret.sizes assert set(data.data_vars) == set(ret.data_vars) assert set(data.coords) == set(ret.coords) assert set(data.xindexes) == set(ret.xindexes) ret = data.isel(time=0, dim1=0, dim2=slice(5)) assert {"dim2": 5, "dim3": 10} == ret.sizes assert set(data.data_vars) == set(ret.data_vars) assert set(data.coords) == set(ret.coords) assert set(data.xindexes) == set(list(ret.xindexes) + ["time"]) def test_isel_fancy(self) -> None: # isel with fancy indexing. data = create_test_data() pdim1 = [1, 2, 3] pdim2 = [4, 5, 1] pdim3 = [1, 2, 3] actual = data.isel( dim1=(("test_coord",), pdim1), dim2=(("test_coord",), pdim2), dim3=(("test_coord",), pdim3), ) assert "test_coord" in actual.dims assert actual.coords["test_coord"].shape == (len(pdim1),) # Should work with DataArray actual = data.isel( dim1=DataArray(pdim1, dims="test_coord"), dim2=(("test_coord",), pdim2), dim3=(("test_coord",), pdim3), ) assert "test_coord" in actual.dims assert actual.coords["test_coord"].shape == (len(pdim1),) expected = data.isel( dim1=(("test_coord",), pdim1), dim2=(("test_coord",), pdim2), dim3=(("test_coord",), pdim3), ) assert_identical(actual, expected) # DataArray with coordinate idx1 = DataArray(pdim1, dims=["a"], coords={"a": np.random.randn(3)}) idx2 = DataArray(pdim2, dims=["b"], coords={"b": np.random.randn(3)}) idx3 = DataArray(pdim3, dims=["c"], coords={"c": np.random.randn(3)}) # Should work with DataArray actual = data.isel(dim1=idx1, dim2=idx2, dim3=idx3) assert "a" in actual.dims assert "b" in actual.dims assert "c" in actual.dims assert "time" in actual.coords assert "dim2" in actual.coords assert "dim3" in actual.coords expected = data.isel( dim1=(("a",), pdim1), dim2=(("b",), pdim2), dim3=(("c",), pdim3) ) expected = expected.assign_coords(a=idx1["a"], b=idx2["b"], c=idx3["c"]) assert_identical(actual, expected) idx1 = DataArray(pdim1, dims=["a"], coords={"a": np.random.randn(3)}) idx2 = DataArray(pdim2, dims=["a"]) idx3 = DataArray(pdim3, dims=["a"]) # Should work with DataArray actual = data.isel(dim1=idx1, dim2=idx2, dim3=idx3) assert "a" in actual.dims assert "time" in actual.coords assert "dim2" in actual.coords assert "dim3" in actual.coords expected = data.isel( dim1=(("a",), pdim1), dim2=(("a",), pdim2), dim3=(("a",), pdim3) ) expected = expected.assign_coords(a=idx1["a"]) assert_identical(actual, expected) actual = data.isel(dim1=(("points",), pdim1), dim2=(("points",), pdim2)) assert "points" in actual.dims assert "dim3" in actual.dims assert "dim3" not in actual.data_vars np.testing.assert_array_equal(data["dim2"][pdim2], actual["dim2"]) # test that the order of the indexers doesn't matter assert_identical( data.isel(dim1=(("points",), pdim1), dim2=(("points",), pdim2)), data.isel(dim2=(("points",), pdim2), dim1=(("points",), pdim1)), ) # make sure we're raising errors in the right places with pytest.raises(IndexError, match=r"Dimensions of indexers mismatch"): data.isel(dim1=(("points",), [1, 2]), dim2=(("points",), [1, 2, 3])) with pytest.raises(TypeError, match=r"cannot use a Dataset"): data.isel(dim1=Dataset({"points": [1, 2]})) # test to be sure we keep around variables that were not indexed ds = Dataset({"x": [1, 2, 3, 4], "y": 0}) actual = ds.isel(x=(("points",), [0, 1, 2])) assert_identical(ds["y"], actual["y"]) # tests using index or DataArray as indexers stations = Dataset() stations["station"] = (("station",), ["A", "B", "C"]) stations["dim1s"] = (("station",), [1, 2, 3]) stations["dim2s"] = (("station",), [4, 5, 1]) actual = data.isel(dim1=stations["dim1s"], dim2=stations["dim2s"]) assert "station" in actual.coords assert "station" in actual.dims assert_identical(actual["station"].drop_vars(["dim2"]), stations["station"]) with pytest.raises(ValueError, match=r"conflicting values/indexes on "): data.isel( dim1=DataArray( [0, 1, 2], dims="station", coords={"station": [0, 1, 2]} ), dim2=DataArray( [0, 1, 2], dims="station", coords={"station": [0, 1, 3]} ), ) # multi-dimensional selection stations = Dataset() stations["a"] = (("a",), ["A", "B", "C"]) stations["b"] = (("b",), [0, 1]) stations["dim1s"] = (("a", "b"), [[1, 2], [2, 3], [3, 4]]) stations["dim2s"] = (("a",), [4, 5, 1]) actual = data.isel(dim1=stations["dim1s"], dim2=stations["dim2s"]) assert "a" in actual.coords assert "a" in actual.dims assert "b" in actual.coords assert "b" in actual.dims assert "dim2" in actual.coords assert "a" in actual["dim2"].dims assert_identical(actual["a"].drop_vars(["dim2"]), stations["a"]) assert_identical(actual["b"], stations["b"]) expected_var1 = data["var1"].variable[ stations["dim1s"].variable, stations["dim2s"].variable ] expected_var2 = data["var2"].variable[ stations["dim1s"].variable, stations["dim2s"].variable ] expected_var3 = data["var3"].variable[slice(None), stations["dim1s"].variable] assert_equal(actual["a"].drop_vars("dim2"), stations["a"]) assert_array_equal(actual["var1"], expected_var1) assert_array_equal(actual["var2"], expected_var2) assert_array_equal(actual["var3"], expected_var3) # test that drop works ds = xr.Dataset({"a": (("x",), [1, 2, 3])}, coords={"b": (("x",), [5, 6, 7])}) actual = ds.isel({"x": 1}, drop=False) expected = xr.Dataset({"a": 2}, coords={"b": 6}) assert_identical(actual, expected) actual = ds.isel({"x": 1}, drop=True) expected = xr.Dataset({"a": 2}) assert_identical(actual, expected) actual = ds.isel({"x": DataArray(1)}, drop=False) expected = xr.Dataset({"a": 2}, coords={"b": 6}) assert_identical(actual, expected) actual = ds.isel({"x": DataArray(1)}, drop=True) expected = xr.Dataset({"a": 2}) assert_identical(actual, expected) def test_isel_dataarray(self) -> None: """Test for indexing by DataArray""" data = create_test_data() # indexing with DataArray with same-name coordinates. indexing_da = DataArray( np.arange(1, 4), dims=["dim1"], coords={"dim1": np.random.randn(3)} ) actual = data.isel(dim1=indexing_da) assert_identical(indexing_da["dim1"], actual["dim1"]) assert_identical(data["dim2"], actual["dim2"]) # Conflict in the dimension coordinate indexing_da = DataArray( np.arange(1, 4), dims=["dim2"], coords={"dim2": np.random.randn(3)} ) with pytest.raises(IndexError, match=r"dimension coordinate 'dim2'"): data.isel(dim2=indexing_da) # Also the case for DataArray with pytest.raises(IndexError, match=r"dimension coordinate 'dim2'"): data["var2"].isel(dim2=indexing_da) with pytest.raises(IndexError, match=r"dimension coordinate 'dim2'"): data["dim2"].isel(dim2=indexing_da) # same name coordinate which does not conflict indexing_da = DataArray( np.arange(1, 4), dims=["dim2"], coords={"dim2": data["dim2"].values[1:4]} ) actual = data.isel(dim2=indexing_da) assert_identical(actual["dim2"], indexing_da["dim2"]) # Silently drop conflicted (non-dimensional) coordinate of indexer indexing_da = DataArray( np.arange(1, 4), dims=["dim2"], coords={ "dim2": data["dim2"].values[1:4], "numbers": ("dim2", np.arange(2, 5)), }, ) actual = data.isel(dim2=indexing_da) assert_identical(actual["numbers"], data["numbers"]) # boolean data array with coordinate with the same name indexing_da = DataArray( np.arange(1, 10), dims=["dim2"], coords={"dim2": data["dim2"].values} ) indexing_da = indexing_da < 3 actual = data.isel(dim2=indexing_da) assert_identical(actual["dim2"], data["dim2"][:2]) # boolean data array with non-dimensioncoordinate indexing_da = DataArray( np.arange(1, 10), dims=["dim2"], coords={ "dim2": data["dim2"].values, "non_dim": (("dim2",), np.random.randn(9)), "non_dim2": 0, }, ) indexing_da = indexing_da < 3 actual = data.isel(dim2=indexing_da) assert_identical( actual["dim2"].drop_vars("non_dim").drop_vars("non_dim2"), data["dim2"][:2] ) assert_identical(actual["non_dim"], indexing_da["non_dim"][:2]) assert_identical(actual["non_dim2"], indexing_da["non_dim2"]) # non-dimension coordinate will be also attached indexing_da = DataArray( np.arange(1, 4), dims=["dim2"], coords={"non_dim": (("dim2",), np.random.randn(3))}, ) actual = data.isel(dim2=indexing_da) assert "non_dim" in actual assert "non_dim" in actual.coords # Index by a scalar DataArray indexing_da = DataArray(3, dims=[], coords={"station": 2}) actual = data.isel(dim2=indexing_da) assert "station" in actual actual = data.isel(dim2=indexing_da["station"]) assert "station" in actual # indexer generated from coordinates indexing_ds = Dataset({}, coords={"dim2": [0, 1, 2]}) with pytest.raises(IndexError, match=r"dimension coordinate 'dim2'"): actual = data.isel(dim2=indexing_ds["dim2"]) def test_isel_fancy_convert_index_variable(self) -> None: # select index variable "x" with a DataArray of dim "z" # -> drop index and convert index variable to base variable ds = xr.Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [0, 1, 2]}) idxr = xr.DataArray([1], dims="z", name="x") actual = ds.isel(x=idxr) assert "x" not in actual.xindexes assert not isinstance(actual.x.variable, IndexVariable) def test_sel(self) -> None: data = create_test_data() int_slicers = {"dim1": slice(None, None, 2), "dim2": slice(2), "dim3": slice(3)} loc_slicers = { "dim1": slice(None, None, 2), "dim2": slice(0, 0.5), "dim3": slice("a", "c"), } assert_equal(data.isel(int_slicers), data.sel(loc_slicers)) data["time"] = ("time", pd.date_range("2000-01-01", periods=20)) assert_equal(data.isel(time=0), data.sel(time="2000-01-01")) assert_equal( data.isel(time=slice(10)), data.sel(time=slice("2000-01-01", "2000-01-10")) ) assert_equal(data, data.sel(time=slice("1999", "2005"))) times = pd.date_range("2000-01-01", periods=3) assert_equal(data.isel(time=slice(3)), data.sel(time=times)) assert_equal( data.isel(time=slice(3)), data.sel(time=(data["time.dayofyear"] <= 3)) ) td = pd.to_timedelta(np.arange(3), unit="days") data = Dataset({"x": ("td", np.arange(3)), "td": td}) assert_equal(data, data.sel(td=td)) assert_equal(data, data.sel(td=slice("3 days"))) assert_equal(data.isel(td=0), data.sel(td=pd.Timedelta("0 days"))) assert_equal(data.isel(td=0), data.sel(td=pd.Timedelta("0h"))) assert_equal(data.isel(td=slice(1, 3)), data.sel(td=slice("1 days", "2 days"))) def test_sel_dataarray(self) -> None: data = create_test_data() ind = DataArray([0.0, 0.5, 1.0], dims=["dim2"]) actual = data.sel(dim2=ind) assert_equal(actual, data.isel(dim2=[0, 1, 2])) # with different dimension ind = DataArray([0.0, 0.5, 1.0], dims=["new_dim"]) actual = data.sel(dim2=ind) expected = data.isel(dim2=Variable("new_dim", [0, 1, 2])) assert "new_dim" in actual.dims assert_equal(actual, expected) # Multi-dimensional ind = DataArray([[0.0], [0.5], [1.0]], dims=["new_dim", "new_dim2"]) actual = data.sel(dim2=ind) expected = data.isel(dim2=Variable(("new_dim", "new_dim2"), [[0], [1], [2]])) assert "new_dim" in actual.dims assert "new_dim2" in actual.dims assert_equal(actual, expected) # with coordinate ind = DataArray( [0.0, 0.5, 1.0], dims=["new_dim"], coords={"new_dim": ["a", "b", "c"]} ) actual = data.sel(dim2=ind) expected = data.isel(dim2=[0, 1, 2]).rename({"dim2": "new_dim"}) assert "new_dim" in actual.dims assert "new_dim" in actual.coords assert_equal( actual.drop_vars("new_dim").drop_vars("dim2"), expected.drop_vars("new_dim") ) assert_equal(actual["new_dim"].drop_vars("dim2"), ind["new_dim"]) # with conflicted coordinate (silently ignored) ind = DataArray( [0.0, 0.5, 1.0], dims=["dim2"], coords={"dim2": ["a", "b", "c"]} ) actual = data.sel(dim2=ind) expected = data.isel(dim2=[0, 1, 2]) assert_equal(actual, expected) # with conflicted coordinate (silently ignored) ind = DataArray( [0.0, 0.5, 1.0], dims=["new_dim"], coords={"new_dim": ["a", "b", "c"], "dim2": 3}, ) actual = data.sel(dim2=ind) assert_equal( actual["new_dim"].drop_vars("dim2"), ind["new_dim"].drop_vars("dim2") ) expected = data.isel(dim2=[0, 1, 2]) expected["dim2"] = (("new_dim"), expected["dim2"].values) assert_equal(actual["dim2"].drop_vars("new_dim"), expected["dim2"]) assert actual["var1"].dims == ("dim1", "new_dim") # with non-dimensional coordinate ind = DataArray( [0.0, 0.5, 1.0], dims=["dim2"], coords={ "dim2": ["a", "b", "c"], "numbers": ("dim2", [0, 1, 2]), "new_dim": ("dim2", [1.1, 1.2, 1.3]), }, ) actual = data.sel(dim2=ind) expected = data.isel(dim2=[0, 1, 2]) assert_equal(actual.drop_vars("new_dim"), expected) assert np.allclose(actual["new_dim"].values, ind["new_dim"].values) def test_sel_dataarray_mindex(self) -> None: midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two")) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") midx_coords["y"] = range(3) mds = xr.Dataset( {"var": (("x", "y"), np.random.rand(6, 3))}, coords=midx_coords ) actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims="x")) actual_sel = mds.sel(x=DataArray(midx[:3], dims="x")) assert actual_isel["x"].dims == ("x",) assert actual_sel["x"].dims == ("x",) assert_identical(actual_isel, actual_sel) actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims="z")) actual_sel = mds.sel(x=Variable("z", midx[:3])) assert actual_isel["x"].dims == ("z",) assert actual_sel["x"].dims == ("z",) assert_identical(actual_isel, actual_sel) # with coordinate actual_isel = mds.isel( x=xr.DataArray(np.arange(3), dims="z", coords={"z": [0, 1, 2]}) ) actual_sel = mds.sel( x=xr.DataArray(midx[:3], dims="z", coords={"z": [0, 1, 2]}) ) assert actual_isel["x"].dims == ("z",) assert actual_sel["x"].dims == ("z",) assert_identical(actual_isel, actual_sel) # Vectorized indexing with level-variables raises an error with pytest.raises(ValueError, match=r"Vectorized selection is "): mds.sel(one=["a", "b"]) with pytest.raises( ValueError, match=r"Vectorized selection is not available along coordinate 'x' with a multi-index", ): mds.sel( x=xr.DataArray( [np.array(midx[:2]), np.array(midx[-2:])], dims=["a", "b"] ) ) def test_sel_categorical(self) -> None: ind = pd.Series(["foo", "bar"], dtype="category") df = pd.DataFrame({"ind": ind, "values": [1, 2]}) ds = df.set_index("ind").to_xarray() actual = ds.sel(ind="bar") expected = ds.isel(ind=1) assert_identical(expected, actual) def test_sel_categorical_error(self) -> None: ind = pd.Series(["foo", "bar"], dtype="category") df = pd.DataFrame({"ind": ind, "values": [1, 2]}) ds = df.set_index("ind").to_xarray() with pytest.raises(ValueError): ds.sel(ind="bar", method="nearest") with pytest.raises(ValueError): ds.sel(ind="bar", tolerance="nearest") # type: ignore[arg-type] def test_categorical_index(self) -> None: cat = pd.CategoricalIndex( ["foo", "bar", "foo"], categories=["foo", "bar", "baz", "qux", "quux", "corge"], ) ds = xr.Dataset( {"var": ("cat", np.arange(3))}, coords={"cat": ("cat", cat), "c": ("cat", [0, 1, 1])}, ) # test slice actual1 = ds.sel(cat="foo") expected1 = ds.isel(cat=[0, 2]) assert_identical(expected1, actual1) # make sure the conversion to the array works actual2 = ds.sel(cat="foo")["cat"].values assert (actual2 == np.array(["foo", "foo"])).all() ds = ds.set_index(index=["cat", "c"]) actual3 = ds.unstack("index") assert actual3["var"].shape == (2, 2) def test_categorical_reindex(self) -> None: cat = pd.CategoricalIndex( ["foo", "bar", "baz"], categories=["foo", "bar", "baz", "qux", "quux", "corge"], ) ds = xr.Dataset( {"var": ("cat", np.arange(3))}, coords={"cat": ("cat", cat), "c": ("cat", [0, 1, 2])}, ) actual = ds.reindex(cat=["foo"])["cat"].values assert (actual == np.array(["foo"])).all() def test_categorical_multiindex(self) -> None: i1 = pd.Series([0, 0]) cat = pd.CategoricalDtype(categories=["foo", "baz", "bar"]) i2 = pd.Series(["baz", "bar"], dtype=cat) df = pd.DataFrame({"i1": i1, "i2": i2, "values": [1, 2]}).set_index( ["i1", "i2"] ) actual = df.to_xarray() assert actual["values"].shape == (1, 2) def test_sel_drop(self) -> None: data = Dataset({"foo": ("x", [1, 2, 3])}, {"x": [0, 1, 2]}) expected = Dataset({"foo": 1}) selected = data.sel(x=0, drop=True) assert_identical(expected, selected) expected = Dataset({"foo": 1}, {"x": 0}) selected = data.sel(x=0, drop=False) assert_identical(expected, selected) data = Dataset({"foo": ("x", [1, 2, 3])}) expected = Dataset({"foo": 1}) selected = data.sel(x=0, drop=True) assert_identical(expected, selected) def test_sel_drop_mindex(self) -> None: midx = pd.MultiIndex.from_arrays([["a", "a"], [1, 2]], names=("foo", "bar")) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") data = Dataset(coords=midx_coords) actual = data.sel(foo="a", drop=True) assert "foo" not in actual.coords actual = data.sel(foo="a", drop=False) assert_equal(actual.foo, DataArray("a", coords={"foo": "a"})) def test_isel_drop(self) -> None: data = Dataset({"foo": ("x", [1, 2, 3])}, {"x": [0, 1, 2]}) expected = Dataset({"foo": 1}) selected = data.isel(x=0, drop=True) assert_identical(expected, selected) expected = Dataset({"foo": 1}, {"x": 0}) selected = data.isel(x=0, drop=False) assert_identical(expected, selected) def test_head(self) -> None: data = create_test_data() expected = data.isel(time=slice(5), dim2=slice(6)) actual = data.head(time=5, dim2=6) assert_equal(expected, actual) expected = data.isel(time=slice(0)) actual = data.head(time=0) assert_equal(expected, actual) expected = data.isel({dim: slice(6) for dim in data.dims}) actual = data.head(6) assert_equal(expected, actual) expected = data.isel({dim: slice(5) for dim in data.dims}) actual = data.head() assert_equal(expected, actual) with pytest.raises(TypeError, match=r"either dict-like or a single int"): data.head([3]) # type: ignore[arg-type] with pytest.raises(TypeError, match=r"expected integer type"): data.head(dim2=3.1) with pytest.raises(ValueError, match=r"expected positive int"): data.head(time=-3) def test_tail(self) -> None: data = create_test_data() expected = data.isel(time=slice(-5, None), dim2=slice(-6, None)) actual = data.tail(time=5, dim2=6) assert_equal(expected, actual) expected = data.isel(dim1=slice(0)) actual = data.tail(dim1=0) assert_equal(expected, actual) expected = data.isel({dim: slice(-6, None) for dim in data.dims}) actual = data.tail(6) assert_equal(expected, actual) expected = data.isel({dim: slice(-5, None) for dim in data.dims}) actual = data.tail() assert_equal(expected, actual) with pytest.raises(TypeError, match=r"either dict-like or a single int"): data.tail([3]) # type: ignore[arg-type] with pytest.raises(TypeError, match=r"expected integer type"): data.tail(dim2=3.1) with pytest.raises(ValueError, match=r"expected positive int"): data.tail(time=-3) def test_thin(self) -> None: data = create_test_data() expected = data.isel(time=slice(None, None, 5), dim2=slice(None, None, 6)) actual = data.thin(time=5, dim2=6) assert_equal(expected, actual) expected = data.isel({dim: slice(None, None, 6) for dim in data.dims}) actual = data.thin(6) assert_equal(expected, actual) with pytest.raises(TypeError, match=r"either dict-like or a single int"): data.thin([3]) # type: ignore[arg-type] with pytest.raises(TypeError, match=r"expected integer type"): data.thin(dim2=3.1) with pytest.raises(ValueError, match=r"cannot be zero"): data.thin(time=0) with pytest.raises(ValueError, match=r"expected positive int"): data.thin(time=-3) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sel_fancy(self) -> None: data = create_test_data() # add in a range() index data["dim1"] = data.dim1 pdim1 = [1, 2, 3] pdim2 = [4, 5, 1] pdim3 = [1, 2, 3] expected = data.isel( dim1=Variable(("test_coord",), pdim1), dim2=Variable(("test_coord",), pdim2), dim3=Variable(("test_coord"), pdim3), ) actual = data.sel( dim1=Variable(("test_coord",), data.dim1[pdim1]), dim2=Variable(("test_coord",), data.dim2[pdim2]), dim3=Variable(("test_coord",), data.dim3[pdim3]), ) assert_identical(expected, actual) # DataArray Indexer idx_t = DataArray( data["time"][[3, 2, 1]].values, dims=["a"], coords={"a": ["a", "b", "c"]} ) idx_2 = DataArray( data["dim2"][[3, 2, 1]].values, dims=["a"], coords={"a": ["a", "b", "c"]} ) idx_3 = DataArray( data["dim3"][[3, 2, 1]].values, dims=["a"], coords={"a": ["a", "b", "c"]} ) actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) expected = data.isel( time=Variable(("a",), [3, 2, 1]), dim2=Variable(("a",), [3, 2, 1]), dim3=Variable(("a",), [3, 2, 1]), ) expected = expected.assign_coords(a=idx_t["a"]) assert_identical(expected, actual) idx_t = DataArray( data["time"][[3, 2, 1]].values, dims=["a"], coords={"a": ["a", "b", "c"]} ) idx_2 = DataArray( data["dim2"][[2, 1, 3]].values, dims=["b"], coords={"b": [0, 1, 2]} ) idx_3 = DataArray( data["dim3"][[1, 2, 1]].values, dims=["c"], coords={"c": [0.0, 1.1, 2.2]} ) actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) expected = data.isel( time=Variable(("a",), [3, 2, 1]), dim2=Variable(("b",), [2, 1, 3]), dim3=Variable(("c",), [1, 2, 1]), ) expected = expected.assign_coords(a=idx_t["a"], b=idx_2["b"], c=idx_3["c"]) assert_identical(expected, actual) # test from sel_points data = Dataset({"foo": (("x", "y"), np.arange(9).reshape(3, 3))}) data.coords.update({"x": [0, 1, 2], "y": [0, 1, 2]}) expected = Dataset( {"foo": ("points", [0, 4, 8])}, coords={ "x": Variable(("points",), [0, 1, 2]), "y": Variable(("points",), [0, 1, 2]), }, ) actual = data.sel( x=Variable(("points",), [0, 1, 2]), y=Variable(("points",), [0, 1, 2]) ) assert_identical(expected, actual) expected.coords.update({"x": ("points", [0, 1, 2]), "y": ("points", [0, 1, 2])}) actual = data.sel( x=Variable(("points",), [0.1, 1.1, 2.5]), y=Variable(("points",), [0, 1.2, 2.0]), method="pad", ) assert_identical(expected, actual) idx_x = DataArray([0, 1, 2], dims=["a"], coords={"a": ["a", "b", "c"]}) idx_y = DataArray([0, 2, 1], dims=["b"], coords={"b": [0, 3, 6]}) expected_ary = data["foo"][[0, 1, 2], [0, 2, 1]] actual = data.sel(x=idx_x, y=idx_y) assert_array_equal(expected_ary, actual["foo"]) assert_identical(actual["a"].drop_vars("x"), idx_x["a"]) assert_identical(actual["b"].drop_vars("y"), idx_y["b"]) with pytest.raises(KeyError): data.sel(x=[2.5], y=[2.0], method="pad", tolerance=1e-3) def test_sel_method(self) -> None: data = create_test_data() expected = data.sel(dim2=1) actual = data.sel(dim2=0.95, method="nearest") assert_identical(expected, actual) actual = data.sel(dim2=0.95, method="nearest", tolerance=1) assert_identical(expected, actual) with pytest.raises(KeyError): actual = data.sel(dim2=np.pi, method="nearest", tolerance=0) expected = data.sel(dim2=[1.5]) actual = data.sel(dim2=[1.45], method="backfill") assert_identical(expected, actual) with pytest.raises(NotImplementedError, match=r"slice objects"): data.sel(dim2=slice(1, 3), method="ffill") with pytest.raises(TypeError, match=r"``method``"): # this should not pass silently data.sel(dim2=1, method=data) # type: ignore[arg-type] # cannot pass method if there is no associated coordinate with pytest.raises(ValueError, match=r"cannot supply"): data.sel(dim1=0, method="nearest") def test_loc(self) -> None: data = create_test_data() expected = data.sel(dim3="a") actual = data.loc[dict(dim3="a")] assert_identical(expected, actual) with pytest.raises(TypeError, match=r"can only lookup dict"): data.loc["a"] # type: ignore[index] def test_selection_multiindex(self) -> None: midx = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [-1, -2]], names=("one", "two", "three") ) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") mdata = Dataset(data_vars={"var": ("x", range(8))}, coords=midx_coords) def test_sel( lab_indexer, pos_indexer, replaced_idx=False, renamed_dim=None ) -> None: ds = mdata.sel(x=lab_indexer) expected_ds = mdata.isel(x=pos_indexer) if not replaced_idx: assert_identical(ds, expected_ds) else: if renamed_dim: assert ds["var"].dims[0] == renamed_dim ds = ds.rename({renamed_dim: "x"}) assert_identical(ds["var"].variable, expected_ds["var"].variable) assert not ds["x"].equals(expected_ds["x"]) test_sel(("a", 1, -1), 0) test_sel(("b", 2, -2), -1) test_sel(("a", 1), [0, 1], replaced_idx=True, renamed_dim="three") test_sel(("a",), range(4), replaced_idx=True) test_sel("a", range(4), replaced_idx=True) test_sel([("a", 1, -1), ("b", 2, -2)], [0, 7]) test_sel(slice("a", "b"), range(8)) test_sel(slice(("a", 1), ("b", 1)), range(6)) test_sel({"one": "a", "two": 1, "three": -1}, 0) test_sel({"one": "a", "two": 1}, [0, 1], replaced_idx=True, renamed_dim="three") test_sel({"one": "a"}, range(4), replaced_idx=True) assert_identical(mdata.loc[{"x": {"one": "a"}}], mdata.sel(x={"one": "a"})) assert_identical(mdata.loc[{"x": "a"}], mdata.sel(x="a")) assert_identical(mdata.loc[{"x": ("a", 1)}], mdata.sel(x=("a", 1))) assert_identical(mdata.loc[{"x": ("a", 1, -1)}], mdata.sel(x=("a", 1, -1))) assert_identical(mdata.sel(x={"one": "a", "two": 1}), mdata.sel(one="a", two=1)) def test_broadcast_like(self) -> None: original1 = DataArray( np.random.randn(5), [("x", range(5))], name="a" ).to_dataset() original2 = DataArray(np.random.randn(6), [("y", range(6))], name="b") expected1, expected2 = broadcast(original1, original2) assert_identical( original1.broadcast_like(original2), expected1.transpose("y", "x") ) assert_identical(original2.broadcast_like(original1), expected2) def test_to_pandas(self) -> None: # 0D -> series actual = Dataset({"a": 1, "b": 2}).to_pandas() expected = pd.Series([1, 2], ["a", "b"]) assert_array_equal(actual, expected) # 1D -> dataframe x = np.random.randn(10) y = np.random.randn(10) t = list("abcdefghij") ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t)}) actual_df = ds.to_pandas() expected_df = ds.to_dataframe() assert expected_df.equals(actual_df), (expected_df, actual_df) # 2D -> error x2d = np.random.randn(10, 10) y2d = np.random.randn(10, 10) with pytest.raises(ValueError, match=r"cannot convert Datasets"): Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas() def test_reindex_like(self) -> None: data = create_test_data() data["letters"] = ("dim3", 10 * ["a"]) expected = data.isel(dim1=slice(10), time=slice(13)) actual = data.reindex_like(expected) assert_identical(actual, expected) expected = data.copy(deep=True) expected["dim3"] = ("dim3", list("cdefghijkl")) expected["var3"][:-2] = expected["var3"][2:].values expected["var3"][-2:] = np.nan expected["letters"] = expected["letters"].astype(object) expected["letters"][-2:] = np.nan expected["numbers"] = expected["numbers"].astype(float) expected["numbers"][:-2] = expected["numbers"][2:].values expected["numbers"][-2:] = np.nan actual = data.reindex_like(expected) assert_identical(actual, expected) def test_reindex(self) -> None: data = create_test_data() assert_identical(data, data.reindex()) expected = data.assign_coords(dim1=data["dim1"]) actual = data.reindex(dim1=data["dim1"]) assert_identical(actual, expected) actual = data.reindex(dim1=data["dim1"].values) assert_identical(actual, expected) actual = data.reindex(dim1=data["dim1"].to_index()) assert_identical(actual, expected) with pytest.raises( ValueError, match=r"cannot reindex or align along dimension" ): data.reindex(dim1=data["dim1"][:5]) expected = data.isel(dim2=slice(5)) actual = data.reindex(dim2=data["dim2"][:5]) assert_identical(actual, expected) # test dict-like argument actual = data.reindex({"dim2": data["dim2"]}) expected = data assert_identical(actual, expected) with pytest.raises(ValueError, match=r"cannot specify both"): data.reindex({"x": 0}, x=0) with pytest.raises(ValueError, match=r"dictionary"): data.reindex("foo") # type: ignore[arg-type] # invalid dimension # TODO: (benbovy - explicit indexes): uncomment? # --> from reindex docstrings: "any mis-matched dimension is simply ignored" # with pytest.raises(ValueError, match=r"indexer keys.*not correspond.*"): # data.reindex(invalid=0) # out of order expected = data.sel(dim2=data["dim2"][:5:-1]) actual = data.reindex(dim2=data["dim2"][:5:-1]) assert_identical(actual, expected) # multiple fill values expected = data.reindex(dim2=[0.1, 2.1, 3.1, 4.1]).assign( var1=lambda ds: ds.var1.copy(data=[[-10, -10, -10, -10]] * len(ds.dim1)), var2=lambda ds: ds.var2.copy(data=[[-20, -20, -20, -20]] * len(ds.dim1)), ) actual = data.reindex( dim2=[0.1, 2.1, 3.1, 4.1], fill_value={"var1": -10, "var2": -20} ) assert_identical(actual, expected) # use the default value expected = data.reindex(dim2=[0.1, 2.1, 3.1, 4.1]).assign( var1=lambda ds: ds.var1.copy(data=[[-10, -10, -10, -10]] * len(ds.dim1)), var2=lambda ds: ds.var2.copy( data=[[np.nan, np.nan, np.nan, np.nan]] * len(ds.dim1) ), ) actual = data.reindex(dim2=[0.1, 2.1, 3.1, 4.1], fill_value={"var1": -10}) assert_identical(actual, expected) # regression test for #279 expected = Dataset({"x": ("time", np.random.randn(5))}, {"time": range(5)}) time2 = DataArray(np.arange(5), dims="time2") with pytest.raises(ValueError): actual = expected.reindex(time=time2) # another regression test ds = Dataset( {"foo": (["x", "y"], np.zeros((3, 4)))}, {"x": range(3), "y": range(4)} ) expected = Dataset( {"foo": (["x", "y"], np.zeros((3, 2)))}, {"x": [0, 1, 3], "y": [0, 1]} ) expected["foo"][-1] = np.nan actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) assert_identical(expected, actual) def test_reindex_attrs_encoding(self) -> None: ds = Dataset( {"data": ("x", [1, 2, 3])}, {"x": ("x", [0, 1, 2], {"foo": "bar"}, {"bar": "baz"})}, ) actual = ds.reindex(x=[0, 1]) expected = Dataset( {"data": ("x", [1, 2])}, {"x": ("x", [0, 1], {"foo": "bar"}, {"bar": "baz"})}, ) assert_identical(actual, expected) assert actual.x.encoding == expected.x.encoding def test_reindex_warning(self) -> None: data = create_test_data() with pytest.raises(ValueError): # DataArray with different dimension raises Future warning ind = xr.DataArray([0.0, 1.0], dims=["new_dim"], name="ind") data.reindex(dim2=ind) # Should not warn ind = xr.DataArray([0.0, 1.0], dims=["dim2"], name="ind") with warnings.catch_warnings(record=True) as ws: data.reindex(dim2=ind) assert len(ws) == 0 def test_reindex_variables_copied(self) -> None: data = create_test_data() reindexed_data = data.reindex(copy=False) for k in data.variables: assert reindexed_data.variables[k] is not data.variables[k] def test_reindex_method(self) -> None: ds = Dataset({"x": ("y", [10, 20]), "y": [0, 1]}) y = [-0.5, 0.5, 1.5] actual = ds.reindex(y=y, method="backfill") expected = Dataset({"x": ("y", [10, 20, np.nan]), "y": y}) assert_identical(expected, actual) actual = ds.reindex(y=y, method="backfill", tolerance=0.1) expected = Dataset({"x": ("y", 3 * [np.nan]), "y": y}) assert_identical(expected, actual) actual = ds.reindex(y=y, method="backfill", tolerance=[0.1, 0.5, 0.1]) expected = Dataset({"x": ("y", [np.nan, 20, np.nan]), "y": y}) assert_identical(expected, actual) actual = ds.reindex(y=[0.1, 0.1, 1], tolerance=[0, 0.1, 0], method="nearest") expected = Dataset({"x": ("y", [np.nan, 10, 20]), "y": [0.1, 0.1, 1]}) assert_identical(expected, actual) actual = ds.reindex(y=y, method="pad") expected = Dataset({"x": ("y", [np.nan, 10, 20]), "y": y}) assert_identical(expected, actual) alt = Dataset({"y": y}) actual = ds.reindex_like(alt, method="pad") assert_identical(expected, actual) @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"x": 2, "z": 1}]) def test_reindex_fill_value(self, fill_value) -> None: ds = Dataset({"x": ("y", [10, 20]), "z": ("y", [-20, -10]), "y": [0, 1]}) y = [0, 1, 2] actual = ds.reindex(y=y, fill_value=fill_value) if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array fill_value_x = fill_value_z = np.nan elif isinstance(fill_value, dict): fill_value_x = fill_value["x"] fill_value_z = fill_value["z"] else: fill_value_x = fill_value_z = fill_value expected = Dataset( { "x": ("y", [10, 20, fill_value_x]), "z": ("y", [-20, -10, fill_value_z]), "y": y, } ) assert_identical(expected, actual) @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"x": 2, "z": 1}]) def test_reindex_like_fill_value(self, fill_value) -> None: ds = Dataset({"x": ("y", [10, 20]), "z": ("y", [-20, -10]), "y": [0, 1]}) y = [0, 1, 2] alt = Dataset({"y": y}) actual = ds.reindex_like(alt, fill_value=fill_value) if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array fill_value_x = fill_value_z = np.nan elif isinstance(fill_value, dict): fill_value_x = fill_value["x"] fill_value_z = fill_value["z"] else: fill_value_x = fill_value_z = fill_value expected = Dataset( { "x": ("y", [10, 20, fill_value_x]), "z": ("y", [-20, -10, fill_value_z]), "y": y, } ) assert_identical(expected, actual) @pytest.mark.parametrize("dtype", [str, bytes]) def test_reindex_str_dtype(self, dtype) -> None: data = Dataset({"data": ("x", [1, 2]), "x": np.array(["a", "b"], dtype=dtype)}) actual = data.reindex(x=data.x) expected = data assert_identical(expected, actual) assert actual.x.dtype == expected.x.dtype @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"foo": 2, "bar": 1}]) def test_align_fill_value(self, fill_value) -> None: x = Dataset({"foo": DataArray([1, 2], dims=["x"], coords={"x": [1, 2]})}) y = Dataset({"bar": DataArray([1, 2], dims=["x"], coords={"x": [1, 3]})}) x2, y2 = align(x, y, join="outer", fill_value=fill_value) if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array fill_value_foo = fill_value_bar = np.nan elif isinstance(fill_value, dict): fill_value_foo = fill_value["foo"] fill_value_bar = fill_value["bar"] else: fill_value_foo = fill_value_bar = fill_value expected_x2 = Dataset( { "foo": DataArray( [1, 2, fill_value_foo], dims=["x"], coords={"x": [1, 2, 3]} ) } ) expected_y2 = Dataset( { "bar": DataArray( [1, fill_value_bar, 2], dims=["x"], coords={"x": [1, 2, 3]} ) } ) assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) def test_align(self) -> None: left = create_test_data() right = left.copy(deep=True) right["dim3"] = ("dim3", list("cdefghijkl")) right["var3"][:-2] = right["var3"][2:].values right["var3"][-2:] = np.random.randn(*right["var3"][-2:].shape) right["numbers"][:-2] = right["numbers"][2:].values right["numbers"][-2:] = -10 intersection = list("cdefghij") union = list("abcdefghijkl") left2, right2 = align(left, right, join="inner") assert_array_equal(left2["dim3"], intersection) assert_identical(left2, right2) left2, right2 = align(left, right, join="outer") assert_array_equal(left2["dim3"], union) assert_equal(left2["dim3"].variable, right2["dim3"].variable) assert_identical(left2.sel(dim3=intersection), right2.sel(dim3=intersection)) assert np.isnan(left2["var3"][-2:]).all() assert np.isnan(right2["var3"][:2]).all() left2, right2 = align(left, right, join="left") assert_equal(left2["dim3"].variable, right2["dim3"].variable) assert_equal(left2["dim3"].variable, left["dim3"].variable) assert_identical(left2.sel(dim3=intersection), right2.sel(dim3=intersection)) assert np.isnan(right2["var3"][:2]).all() left2, right2 = align(left, right, join="right") assert_equal(left2["dim3"].variable, right2["dim3"].variable) assert_equal(left2["dim3"].variable, right["dim3"].variable) assert_identical(left2.sel(dim3=intersection), right2.sel(dim3=intersection)) assert np.isnan(left2["var3"][-2:]).all() with pytest.raises(ValueError, match=r"invalid value for join"): align(left, right, join="foobar") # type: ignore[call-overload] with pytest.raises(TypeError): align(left, right, foo="bar") # type: ignore[call-overload] def test_align_exact(self) -> None: left = xr.Dataset(coords={"x": [0, 1]}) right = xr.Dataset(coords={"x": [1, 2]}) left1, left2 = xr.align(left, left, join="exact") assert_identical(left1, left) assert_identical(left2, left) with pytest.raises(ValueError, match=r"cannot align.*join.*exact.*not equal.*"): xr.align(left, right, join="exact") def test_align_override(self) -> None: left = xr.Dataset(coords={"x": [0, 1, 2]}) right = xr.Dataset(coords={"x": [0.1, 1.1, 2.1], "y": [1, 2, 3]}) expected_right = xr.Dataset(coords={"x": [0, 1, 2], "y": [1, 2, 3]}) new_left, new_right = xr.align(left, right, join="override") assert_identical(left, new_left) assert_identical(new_right, expected_right) new_left, new_right = xr.align(left, right, exclude="x", join="override") assert_identical(left, new_left) assert_identical(right, new_right) new_left, new_right = xr.align( left.isel(x=0, drop=True), right, exclude="x", join="override" ) assert_identical(left.isel(x=0, drop=True), new_left) assert_identical(right, new_right) with pytest.raises( ValueError, match=r"cannot align.*join.*override.*same size" ): xr.align(left.isel(x=0).expand_dims("x"), right, join="override") def test_align_exclude(self) -> None: x = Dataset( { "foo": DataArray( [[1, 2], [3, 4]], dims=["x", "y"], coords={"x": [1, 2], "y": [3, 4]} ) } ) y = Dataset( { "bar": DataArray( [[1, 2], [3, 4]], dims=["x", "y"], coords={"x": [1, 3], "y": [5, 6]} ) } ) x2, y2 = align(x, y, exclude=["y"], join="outer") expected_x2 = Dataset( { "foo": DataArray( [[1, 2], [3, 4], [np.nan, np.nan]], dims=["x", "y"], coords={"x": [1, 2, 3], "y": [3, 4]}, ) } ) expected_y2 = Dataset( { "bar": DataArray( [[1, 2], [np.nan, np.nan], [3, 4]], dims=["x", "y"], coords={"x": [1, 2, 3], "y": [5, 6]}, ) } ) assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) def test_align_nocopy(self) -> None: x = Dataset({"foo": DataArray([1, 2, 3], coords=[("x", [1, 2, 3])])}) y = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 2])])}) expected_x2 = x expected_y2 = Dataset( {"foo": DataArray([1, 2, np.nan], coords=[("x", [1, 2, 3])])} ) x2, y2 = align(x, y, copy=False, join="outer") assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) assert source_ndarray(x["foo"].data) is source_ndarray(x2["foo"].data) x2, y2 = align(x, y, copy=True, join="outer") assert source_ndarray(x["foo"].data) is not source_ndarray(x2["foo"].data) assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) def test_align_indexes(self) -> None: x = Dataset({"foo": DataArray([1, 2, 3], dims="x", coords=[("x", [1, 2, 3])])}) (x2,) = align(x, indexes={"x": [2, 3, 1]}) expected_x2 = Dataset( {"foo": DataArray([2, 3, 1], dims="x", coords={"x": [2, 3, 1]})} ) assert_identical(expected_x2, x2) def test_align_non_unique(self) -> None: x = Dataset({"foo": ("x", [3, 4, 5]), "x": [0, 0, 1]}) x1, x2 = align(x, x) assert_identical(x1, x) assert_identical(x2, x) y = Dataset({"bar": ("x", [6, 7]), "x": [0, 1]}) with pytest.raises(ValueError, match=r"cannot reindex or align"): align(x, y) def test_align_str_dtype(self) -> None: a = Dataset({"foo": ("x", [0, 1])}, coords={"x": ["a", "b"]}) b = Dataset({"foo": ("x", [1, 2])}, coords={"x": ["b", "c"]}) expected_a = Dataset( {"foo": ("x", [0, 1, np.nan])}, coords={"x": ["a", "b", "c"]} ) expected_b = Dataset( {"foo": ("x", [np.nan, 1, 2])}, coords={"x": ["a", "b", "c"]} ) actual_a, actual_b = xr.align(a, b, join="outer") assert_identical(expected_a, actual_a) assert expected_a.x.dtype == actual_a.x.dtype assert_identical(expected_b, actual_b) assert expected_b.x.dtype == actual_b.x.dtype @pytest.mark.parametrize("join", ["left", "override"]) def test_align_index_var_attrs(self, join) -> None: # regression test https://github.com/pydata/xarray/issues/6852 # aligning two objects should have no side effect on their index variable # metadata. ds = Dataset(coords={"x": ("x", [1, 2, 3], {"units": "m"})}) ds_noattr = Dataset(coords={"x": ("x", [1, 2, 3])}) xr.align(ds_noattr, ds, join=join) assert ds.x.attrs == {"units": "m"} assert ds_noattr.x.attrs == {} def test_broadcast(self) -> None: ds = Dataset( {"foo": 0, "bar": ("x", [1]), "baz": ("y", [2, 3])}, {"c": ("x", [4])} ) expected = Dataset( { "foo": (("x", "y"), [[0, 0]]), "bar": (("x", "y"), [[1, 1]]), "baz": (("x", "y"), [[2, 3]]), }, {"c": ("x", [4])}, ) (actual,) = broadcast(ds) assert_identical(expected, actual) ds_x = Dataset({"foo": ("x", [1])}) ds_y = Dataset({"bar": ("y", [2, 3])}) expected_x = Dataset({"foo": (("x", "y"), [[1, 1]])}) expected_y = Dataset({"bar": (("x", "y"), [[2, 3]])}) actual_x, actual_y = broadcast(ds_x, ds_y) assert_identical(expected_x, actual_x) assert_identical(expected_y, actual_y) array_y = ds_y["bar"] expected_y2 = expected_y["bar"] actual_x2, actual_y2 = broadcast(ds_x, array_y) assert_identical(expected_x, actual_x2) assert_identical(expected_y2, actual_y2) def test_broadcast_nocopy(self) -> None: # Test that data is not copied if not needed x = Dataset({"foo": (("x", "y"), [[1, 1]])}) y = Dataset({"bar": ("y", [2, 3])}) (actual_x,) = broadcast(x) assert_identical(x, actual_x) assert source_ndarray(actual_x["foo"].data) is source_ndarray(x["foo"].data) actual_x, actual_y = broadcast(x, y) assert_identical(x, actual_x) assert source_ndarray(actual_x["foo"].data) is source_ndarray(x["foo"].data) def test_broadcast_exclude(self) -> None: x = Dataset( { "foo": DataArray( [[1, 2], [3, 4]], dims=["x", "y"], coords={"x": [1, 2], "y": [3, 4]} ), "bar": DataArray(5), } ) y = Dataset( { "foo": DataArray( [[1, 2]], dims=["z", "y"], coords={"z": [1], "y": [5, 6]} ) } ) x2, y2 = broadcast(x, y, exclude=["y"]) expected_x2 = Dataset( { "foo": DataArray( [[[1, 2]], [[3, 4]]], dims=["x", "z", "y"], coords={"z": [1], "x": [1, 2], "y": [3, 4]}, ), "bar": DataArray( [[5], [5]], dims=["x", "z"], coords={"x": [1, 2], "z": [1]} ), } ) expected_y2 = Dataset( { "foo": DataArray( [[[1, 2]], [[1, 2]]], dims=["x", "z", "y"], coords={"z": [1], "x": [1, 2], "y": [5, 6]}, ) } ) assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) def test_broadcast_misaligned(self) -> None: x = Dataset({"foo": DataArray([1, 2, 3], coords=[("x", [-1, -2, -3])])}) y = Dataset( { "bar": DataArray( [[1, 2], [3, 4]], dims=["y", "x"], coords={"y": [1, 2], "x": [10, -3]}, ) } ) x2, y2 = broadcast(x, y) expected_x2 = Dataset( { "foo": DataArray( [[3, 3], [2, 2], [1, 1], [np.nan, np.nan]], dims=["x", "y"], coords={"y": [1, 2], "x": [-3, -2, -1, 10]}, ) } ) expected_y2 = Dataset( { "bar": DataArray( [[2, 4], [np.nan, np.nan], [np.nan, np.nan], [1, 3]], dims=["x", "y"], coords={"y": [1, 2], "x": [-3, -2, -1, 10]}, ) } ) assert_identical(expected_x2, x2) assert_identical(expected_y2, y2) def test_broadcast_multi_index(self) -> None: # GH6430 ds = Dataset( {"foo": (("x", "y", "z"), np.ones((3, 4, 2)))}, {"x": ["a", "b", "c"], "y": [1, 2, 3, 4]}, ) stacked = ds.stack(space=["x", "y"]) broadcasted, _ = broadcast(stacked, stacked.space) assert broadcasted.xindexes["x"] is broadcasted.xindexes["space"] assert broadcasted.xindexes["y"] is broadcasted.xindexes["space"] def test_variable_indexing(self) -> None: data = create_test_data() v = data["var1"] d1 = data["dim1"] d2 = data["dim2"] assert_equal(v, v[d1.values]) assert_equal(v, v[d1]) assert_equal(v[:3], v[d1 < 3]) assert_equal(v[:, 3:], v[:, d2 >= 1.5]) assert_equal(v[:3, 3:], v[d1 < 3, d2 >= 1.5]) assert_equal(v[:3, :2], v[range(3), range(2)]) assert_equal(v[:3, :2], v.loc[d1[:3], d2[:2]]) def test_drop_variables(self) -> None: data = create_test_data() assert_identical(data, data.drop_vars([])) expected = Dataset({k: data[k] for k in data.variables if k != "time"}) actual = data.drop_vars("time") assert_identical(expected, actual) actual = data.drop_vars(["time"]) assert_identical(expected, actual) with pytest.raises( ValueError, match=re.escape( "These variables cannot be found in this dataset: ['not_found_here']" ), ): data.drop_vars("not_found_here") actual = data.drop_vars("not_found_here", errors="ignore") assert_identical(data, actual) actual = data.drop_vars(["not_found_here"], errors="ignore") assert_identical(data, actual) actual = data.drop_vars(["time", "not_found_here"], errors="ignore") assert_identical(expected, actual) # deprecated approach with `drop` works (straight copy paste from above) with pytest.warns(DeprecationWarning): actual = data.drop("not_found_here", errors="ignore") assert_identical(data, actual) with pytest.warns(DeprecationWarning): actual = data.drop(["not_found_here"], errors="ignore") assert_identical(data, actual) with pytest.warns(DeprecationWarning): actual = data.drop(["time", "not_found_here"], errors="ignore") assert_identical(expected, actual) with pytest.warns(DeprecationWarning): actual = data.drop({"time", "not_found_here"}, errors="ignore") assert_identical(expected, actual) def test_drop_multiindex_level(self) -> None: data = create_test_multiindex() expected = data.drop_vars(["x", "level_1", "level_2"]) with pytest.warns(DeprecationWarning): actual = data.drop_vars("level_1") assert_identical(expected, actual) def test_drop_index_labels(self) -> None: data = Dataset({"A": (["x", "y"], np.random.randn(2, 3)), "x": ["a", "b"]}) with pytest.warns(DeprecationWarning): actual = data.drop(["a"], dim="x") expected = data.isel(x=[1]) assert_identical(expected, actual) with pytest.warns(DeprecationWarning): actual = data.drop(["a", "b"], dim="x") expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) with pytest.raises(KeyError): # not contained in axis with pytest.warns(DeprecationWarning): data.drop(["c"], dim="x") with pytest.warns(DeprecationWarning): actual = data.drop(["c"], dim="x", errors="ignore") assert_identical(data, actual) with pytest.raises(ValueError): data.drop(["c"], dim="x", errors="wrong_value") # type: ignore[arg-type] with pytest.warns(DeprecationWarning): actual = data.drop(["a", "b", "c"], "x", errors="ignore") expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) # DataArrays as labels are a nasty corner case as they are not # Iterable[Hashable] - DataArray.__iter__ yields scalar DataArrays. actual = data.drop_sel(x=DataArray(["a", "b", "c"]), errors="ignore") expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) with pytest.warns(DeprecationWarning): data.drop(DataArray(["a", "b", "c"]), dim="x", errors="ignore") assert_identical(expected, actual) actual = data.drop_sel(y=[1]) expected = data.isel(y=[0, 2]) assert_identical(expected, actual) with pytest.raises(KeyError, match=r"not found in axis"): data.drop_sel(x=0) def test_drop_labels_by_keyword(self) -> None: data = Dataset( {"A": (["x", "y"], np.random.randn(2, 6)), "x": ["a", "b"], "y": range(6)} ) # Basic functionality. assert len(data.coords["x"]) == 2 with pytest.warns(DeprecationWarning): ds1 = data.drop(["a"], dim="x") ds2 = data.drop_sel(x="a") ds3 = data.drop_sel(x=["a"]) ds4 = data.drop_sel(x=["a", "b"]) ds5 = data.drop_sel(x=["a", "b"], y=range(0, 6, 2)) arr = DataArray(range(3), dims=["c"]) with pytest.warns(DeprecationWarning): data.drop(arr.coords) with pytest.warns(DeprecationWarning): data.drop(arr.xindexes) assert_array_equal(ds1.coords["x"], ["b"]) assert_array_equal(ds2.coords["x"], ["b"]) assert_array_equal(ds3.coords["x"], ["b"]) assert ds4.coords["x"].size == 0 assert ds5.coords["x"].size == 0 assert_array_equal(ds5.coords["y"], [1, 3, 5]) # Error handling if user tries both approaches. with pytest.raises(ValueError): data.drop(labels=["a"], x="a") with pytest.raises(ValueError): data.drop(labels=["a"], dim="x", x="a") warnings.filterwarnings("ignore", r"\W*drop") with pytest.raises(ValueError): data.drop(dim="x", x="a") def test_drop_labels_by_position(self) -> None: data = Dataset( {"A": (["x", "y"], np.random.randn(2, 6)), "x": ["a", "b"], "y": range(6)} ) # Basic functionality. assert len(data.coords["x"]) == 2 actual = data.drop_isel(x=0) expected = data.drop_sel(x="a") assert_identical(expected, actual) actual = data.drop_isel(x=[0]) expected = data.drop_sel(x=["a"]) assert_identical(expected, actual) actual = data.drop_isel(x=[0, 1]) expected = data.drop_sel(x=["a", "b"]) assert_identical(expected, actual) assert actual.coords["x"].size == 0 actual = data.drop_isel(x=[0, 1], y=range(0, 6, 2)) expected = data.drop_sel(x=["a", "b"], y=range(0, 6, 2)) assert_identical(expected, actual) assert actual.coords["x"].size == 0 with pytest.raises(KeyError): data.drop_isel(z=1) def test_drop_indexes(self) -> None: ds = Dataset( coords={ "x": ("x", [0, 1, 2]), "y": ("y", [3, 4, 5]), "foo": ("x", ["a", "a", "b"]), } ) actual = ds.drop_indexes("x") assert "x" not in actual.xindexes assert type(actual.x.variable) is Variable actual = ds.drop_indexes(["x", "y"]) assert "x" not in actual.xindexes assert "y" not in actual.xindexes assert type(actual.x.variable) is Variable assert type(actual.y.variable) is Variable with pytest.raises( ValueError, match=r"The coordinates \('not_a_coord',\) are not found in the dataset coordinates", ): ds.drop_indexes("not_a_coord") with pytest.raises(ValueError, match="those coordinates do not have an index"): ds.drop_indexes("foo") actual = ds.drop_indexes(["foo", "not_a_coord"], errors="ignore") assert_identical(actual, ds) # test index corrupted midx = pd.MultiIndex.from_tuples([([1, 2]), ([3, 4])], names=["a", "b"]) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") ds = Dataset(coords=midx_coords) with pytest.raises(ValueError, match=".*would corrupt the following index.*"): ds.drop_indexes("a") def test_drop_dims(self) -> None: data = xr.Dataset( { "A": (["x", "y"], np.random.randn(2, 3)), "B": ("x", np.random.randn(2)), "x": ["a", "b"], "z": np.pi, } ) actual = data.drop_dims("x") expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) actual = data.drop_dims("y") expected = data.drop_vars("A") assert_identical(expected, actual) actual = data.drop_dims(["x", "y"]) expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) with pytest.raises((ValueError, KeyError)): data.drop_dims("z") # not a dimension with pytest.raises((ValueError, KeyError)): data.drop_dims(None) # type:ignore[arg-type] actual = data.drop_dims("z", errors="ignore") assert_identical(data, actual) # should this be allowed? actual = data.drop_dims(None, errors="ignore") # type:ignore[arg-type] assert_identical(data, actual) with pytest.raises(ValueError): actual = data.drop_dims("z", errors="wrong_value") # type: ignore[arg-type] actual = data.drop_dims(["x", "y", "z"], errors="ignore") expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) def test_copy(self) -> None: data = create_test_data() data.attrs["Test"] = [1, 2, 3] for copied in [data.copy(deep=False), copy(data)]: assert_identical(data, copied) assert data.encoding == copied.encoding # Note: IndexVariable objects with string dtype are always # copied because of xarray.core.indexes.safe_cast_to_index. # Limiting the test to data variables. for k in data.data_vars: v0 = data.variables[k] v1 = copied.variables[k] assert source_ndarray(v0.data) is source_ndarray(v1.data) copied["foo"] = ("z", np.arange(5)) assert "foo" not in data copied.attrs["foo"] = "bar" assert "foo" not in data.attrs assert data.attrs["Test"] is copied.attrs["Test"] for copied in [data.copy(deep=True), deepcopy(data)]: assert_identical(data, copied) for k, v0 in data.variables.items(): v1 = copied.variables[k] assert v0 is not v1 assert data.attrs["Test"] is not copied.attrs["Test"] def test_copy_with_data(self) -> None: orig = create_test_data() new_data = {k: np.random.randn(*v.shape) for k, v in orig.data_vars.items()} actual = orig.copy(data=new_data) expected = orig.copy() for k, v in new_data.items(): expected[k].data = v assert_identical(expected, actual) @pytest.mark.xfail(raises=AssertionError) @pytest.mark.parametrize( "deep, expected_orig", [ [ True, xr.DataArray( xr.IndexVariable("a", np.array([1, 2])), coords={"a": [1, 2]}, dims=["a"], ), ], [ False, xr.DataArray( xr.IndexVariable("a", np.array([999, 2])), coords={"a": [999, 2]}, dims=["a"], ), ], ], ) def test_copy_coords(self, deep, expected_orig) -> None: """The test fails for the shallow copy, and apparently only on Windows for some reason. In windows coords seem to be immutable unless it's one dataset deep copied from another.""" ds = xr.DataArray( np.ones([2, 2, 2]), coords={"a": [1, 2], "b": ["x", "y"], "c": [0, 1]}, dims=["a", "b", "c"], name="value", ).to_dataset() ds_cp = ds.copy(deep=deep) new_a = np.array([999, 2]) ds_cp.coords["a"] = ds_cp.a.copy(data=new_a) expected_cp = xr.DataArray( xr.IndexVariable("a", new_a), coords={"a": [999, 2]}, dims=["a"], ) assert_identical(ds_cp.coords["a"], expected_cp) assert_identical(ds.coords["a"], expected_orig) def test_copy_with_data_errors(self) -> None: orig = create_test_data() new_var1 = np.arange(orig["var1"].size).reshape(orig["var1"].shape) with pytest.raises(ValueError, match=r"Data must be dict-like"): orig.copy(data=new_var1) # type: ignore[arg-type] with pytest.raises(ValueError, match=r"only contain variables in original"): orig.copy(data={"not_in_original": new_var1}) with pytest.raises(ValueError, match=r"contain all variables in original"): orig.copy(data={"var1": new_var1}) def test_drop_encoding(self) -> None: orig = create_test_data() vencoding = {"scale_factor": 10} orig.encoding = {"foo": "bar"} for k in orig.variables.keys(): orig[k].encoding = vencoding actual = orig.drop_encoding() assert actual.encoding == {} for v in actual.variables.values(): assert v.encoding == {} assert_equal(actual, orig) def test_rename(self) -> None: data = create_test_data() newnames = { "var1": "renamed_var1", "dim2": "renamed_dim2", } renamed = data.rename(newnames) variables = dict(data.variables) for nk, nv in newnames.items(): variables[nv] = variables.pop(nk) for k, v in variables.items(): dims = list(v.dims) for name, newname in newnames.items(): if name in dims: dims[dims.index(name)] = newname assert_equal( Variable(dims, v.values, v.attrs), renamed[k].variable.to_base_variable(), ) assert v.encoding == renamed[k].encoding assert type(v) is type(renamed.variables[k]) assert "var1" not in renamed assert "dim2" not in renamed with pytest.raises(ValueError, match=r"cannot rename 'not_a_var'"): data.rename({"not_a_var": "nada"}) with pytest.raises(ValueError, match=r"'var1' conflicts"): data.rename({"var2": "var1"}) # verify that we can rename a variable without accessing the data var1 = data["var1"] data["var1"] = (var1.dims, InaccessibleArray(var1.values)) renamed = data.rename(newnames) with pytest.raises(UnexpectedDataAccess): _ = renamed["renamed_var1"].values # https://github.com/python/mypy/issues/10008 renamed_kwargs = data.rename(**newnames) # type: ignore[arg-type] assert_identical(renamed, renamed_kwargs) def test_rename_old_name(self) -> None: # regtest for GH1477 data = create_test_data() with pytest.raises(ValueError, match=r"'samecol' conflicts"): data.rename({"var1": "samecol", "var2": "samecol"}) # This shouldn't cause any problems. data.rename({"var1": "var2", "var2": "var1"}) def test_rename_same_name(self) -> None: data = create_test_data() newnames = {"var1": "var1", "dim2": "dim2"} renamed = data.rename(newnames) assert_identical(renamed, data) def test_rename_dims(self) -> None: original = Dataset({"x": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42}) expected = Dataset( {"x": ("x_new", [0, 1, 2]), "y": ("x_new", [10, 11, 12]), "z": 42} ) # TODO: (benbovy - explicit indexes) update when set_index supports # setting index for non-dimension variables expected = expected.set_coords("x") actual = original.rename_dims({"x": "x_new"}) assert_identical(expected, actual, check_default_indexes=False) actual_2 = original.rename_dims(x="x_new") assert_identical(expected, actual_2, check_default_indexes=False) # Test to raise ValueError dims_dict_bad = {"x_bad": "x_new"} with pytest.raises(ValueError): original.rename_dims(dims_dict_bad) with pytest.raises(ValueError): original.rename_dims({"x": "z"}) def test_rename_vars(self) -> None: original = Dataset({"x": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42}) expected = Dataset( {"x_new": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42} ) # TODO: (benbovy - explicit indexes) update when set_index supports # setting index for non-dimension variables expected = expected.set_coords("x_new") actual = original.rename_vars({"x": "x_new"}) assert_identical(expected, actual, check_default_indexes=False) actual_2 = original.rename_vars(x="x_new") assert_identical(expected, actual_2, check_default_indexes=False) # Test to raise ValueError names_dict_bad = {"x_bad": "x_new"} with pytest.raises(ValueError): original.rename_vars(names_dict_bad) def test_rename_dimension_coord(self) -> None: # rename a dimension corodinate to a non-dimension coordinate # should preserve index original = Dataset(coords={"x": ("x", [0, 1, 2])}) actual = original.rename_vars({"x": "x_new"}) assert "x_new" in actual.xindexes actual_2 = original.rename_dims({"x": "x_new"}) assert "x" in actual_2.xindexes def test_rename_dimension_coord_warnings(self) -> None: # create a dimension coordinate by renaming a dimension or coordinate # should raise a warning (no index created) ds = Dataset(coords={"x": ("y", [0, 1])}) with pytest.warns( UserWarning, match="rename 'x' to 'y' does not create an index.*" ): ds.rename(x="y") ds = Dataset(coords={"y": ("x", [0, 1])}) with pytest.warns( UserWarning, match="rename 'x' to 'y' does not create an index.*" ): ds.rename(x="y") # No operation should not raise a warning ds = Dataset( data_vars={"data": (("x", "y"), np.ones((2, 3)))}, coords={"x": range(2), "y": range(3), "a": ("x", [3, 4])}, ) with warnings.catch_warnings(): warnings.simplefilter("error") ds.rename(x="x") def test_rename_multiindex(self) -> None: midx = pd.MultiIndex.from_tuples([([1, 2]), ([3, 4])], names=["a", "b"]) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") original = Dataset({}, midx_coords) midx_renamed = midx.rename(["a", "c"]) midx_coords_renamed = Coordinates.from_pandas_multiindex(midx_renamed, "x") expected = Dataset({}, midx_coords_renamed) actual = original.rename({"b": "c"}) assert_identical(expected, actual) with pytest.raises(ValueError, match=r"'a' conflicts"): with pytest.warns(UserWarning, match="does not create an index anymore"): original.rename({"x": "a"}) with pytest.raises(ValueError, match=r"'x' conflicts"): with pytest.warns(UserWarning, match="does not create an index anymore"): original.rename({"a": "x"}) with pytest.raises(ValueError, match=r"'b' conflicts"): original.rename({"a": "b"}) def test_rename_preserve_attrs_encoding(self) -> None: # test propagate attrs/encoding to new variable(s) created from Index object original = Dataset(coords={"x": ("x", [0, 1, 2])}) expected = Dataset(coords={"y": ("y", [0, 1, 2])}) for ds, dim in zip([original, expected], ["x", "y"], strict=True): ds[dim].attrs = {"foo": "bar"} ds[dim].encoding = {"foo": "bar"} actual = original.rename({"x": "y"}) assert_identical(actual, expected) @requires_cftime def test_rename_does_not_change_CFTimeIndex_type(self) -> None: # make sure CFTimeIndex is not converted to DatetimeIndex #3522 time = xr.cftime_range(start="2000", periods=6, freq="2MS", calendar="noleap") orig = Dataset(coords={"time": time}) renamed = orig.rename(time="time_new") assert "time_new" in renamed.xindexes # TODO: benbovy - flexible indexes: update when CFTimeIndex # inherits from xarray.Index assert isinstance(renamed.xindexes["time_new"].to_pandas_index(), CFTimeIndex) assert renamed.xindexes["time_new"].to_pandas_index().name == "time_new" # check original has not changed assert "time" in orig.xindexes assert isinstance(orig.xindexes["time"].to_pandas_index(), CFTimeIndex) assert orig.xindexes["time"].to_pandas_index().name == "time" # note: rename_dims(time="time_new") drops "ds.indexes" renamed = orig.rename_dims() assert isinstance(renamed.xindexes["time"].to_pandas_index(), CFTimeIndex) renamed = orig.rename_vars() assert isinstance(renamed.xindexes["time"].to_pandas_index(), CFTimeIndex) def test_rename_does_not_change_DatetimeIndex_type(self) -> None: # make sure DatetimeIndex is conderved on rename time = pd.date_range(start="2000", periods=6, freq="2MS") orig = Dataset(coords={"time": time}) renamed = orig.rename(time="time_new") assert "time_new" in renamed.xindexes # TODO: benbovy - flexible indexes: update when DatetimeIndex # inherits from xarray.Index? assert isinstance(renamed.xindexes["time_new"].to_pandas_index(), DatetimeIndex) assert renamed.xindexes["time_new"].to_pandas_index().name == "time_new" # check original has not changed assert "time" in orig.xindexes assert isinstance(orig.xindexes["time"].to_pandas_index(), DatetimeIndex) assert orig.xindexes["time"].to_pandas_index().name == "time" # note: rename_dims(time="time_new") drops "ds.indexes" renamed = orig.rename_dims() assert isinstance(renamed.xindexes["time"].to_pandas_index(), DatetimeIndex) renamed = orig.rename_vars() assert isinstance(renamed.xindexes["time"].to_pandas_index(), DatetimeIndex) def test_swap_dims(self) -> None: original = Dataset({"x": [1, 2, 3], "y": ("x", list("abc")), "z": 42}) expected = Dataset({"z": 42}, {"x": ("y", [1, 2, 3]), "y": list("abc")}) actual = original.swap_dims({"x": "y"}) assert_identical(expected, actual) assert isinstance(actual.variables["y"], IndexVariable) assert isinstance(actual.variables["x"], Variable) assert actual.xindexes["y"].equals(expected.xindexes["y"]) roundtripped = actual.swap_dims({"y": "x"}) assert_identical(original.set_coords("y"), roundtripped) with pytest.raises(ValueError, match=r"cannot swap"): original.swap_dims({"y": "x"}) with pytest.raises(ValueError, match=r"replacement dimension"): original.swap_dims({"x": "z"}) expected = Dataset( {"y": ("u", list("abc")), "z": 42}, coords={"x": ("u", [1, 2, 3])} ) actual = original.swap_dims({"x": "u"}) assert_identical(expected, actual) # as kwargs expected = Dataset( {"y": ("u", list("abc")), "z": 42}, coords={"x": ("u", [1, 2, 3])} ) actual = original.swap_dims(x="u") assert_identical(expected, actual) # handle multiindex case midx = pd.MultiIndex.from_arrays([list("aab"), list("yzz")], names=["y1", "y2"]) original = Dataset({"x": [1, 2, 3], "y": ("x", midx), "z": 42}) midx_coords = Coordinates.from_pandas_multiindex(midx, "y") midx_coords["x"] = ("y", [1, 2, 3]) expected = Dataset({"z": 42}, midx_coords) actual = original.swap_dims({"x": "y"}) assert_identical(expected, actual) assert isinstance(actual.variables["y"], IndexVariable) assert isinstance(actual.variables["x"], Variable) assert actual.xindexes["y"].equals(expected.xindexes["y"]) def test_expand_dims_error(self) -> None: original = Dataset( { "x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3)), "z": ("a", np.random.randn(3)), }, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) with pytest.raises(ValueError, match=r"already exists"): original.expand_dims(dim=["x"]) # Make sure it raises true error also for non-dimensional coordinates # which has dimension. original = original.set_coords("z") with pytest.raises(ValueError, match=r"already exists"): original.expand_dims(dim=["z"]) original = Dataset( { "x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3)), "z": ("a", np.random.randn(3)), }, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) with pytest.raises(TypeError, match=r"value of new dimension"): original.expand_dims({"d": 3.2}) with pytest.raises(ValueError, match=r"both keyword and positional"): original.expand_dims({"d": 4}, e=4) def test_expand_dims_int(self) -> None: original = Dataset( {"x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3))}, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) actual = original.expand_dims(["z"], [1]) expected = Dataset( { "x": original["x"].expand_dims("z", 1), "y": original["y"].expand_dims("z", 1), }, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) assert_identical(expected, actual) # make sure squeeze restores the original data set. roundtripped = actual.squeeze("z") assert_identical(original, roundtripped) # another test with a negative axis actual = original.expand_dims(["z"], [-1]) expected = Dataset( { "x": original["x"].expand_dims("z", -1), "y": original["y"].expand_dims("z", -1), }, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) assert_identical(expected, actual) # make sure squeeze restores the original data set. roundtripped = actual.squeeze("z") assert_identical(original, roundtripped) def test_expand_dims_coords(self) -> None: original = Dataset({"x": ("a", np.array([1, 2, 3]))}) expected = Dataset( {"x": (("b", "a"), np.array([[1, 2, 3], [1, 2, 3]]))}, coords={"b": [1, 2]} ) actual = original.expand_dims(dict(b=[1, 2])) assert_identical(expected, actual) assert "b" not in original._coord_names def test_expand_dims_existing_scalar_coord(self) -> None: original = Dataset({"x": 1}, {"a": 2}) expected = Dataset({"x": (("a",), [1])}, {"a": [2]}) actual = original.expand_dims("a") assert_identical(expected, actual) def test_isel_expand_dims_roundtrip(self) -> None: original = Dataset({"x": (("a",), [1])}, {"a": [2]}) actual = original.isel(a=0).expand_dims("a") assert_identical(actual, original) def test_expand_dims_mixed_int_and_coords(self) -> None: # Test expanding one dimension to have size > 1 that doesn't have # coordinates, and also expanding another dimension to have size > 1 # that DOES have coordinates. original = Dataset( {"x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3))}, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, ) actual = original.expand_dims({"d": 4, "e": ["l", "m", "n"]}) expected = Dataset( { "x": xr.DataArray( original["x"].values * np.ones([4, 3, 3]), coords=dict(d=range(4), e=["l", "m", "n"], a=np.linspace(0, 1, 3)), dims=["d", "e", "a"], ).drop_vars("d"), "y": xr.DataArray( original["y"].values * np.ones([4, 3, 4, 3]), coords=dict( d=range(4), e=["l", "m", "n"], b=np.linspace(0, 1, 4), a=np.linspace(0, 1, 3), ), dims=["d", "e", "b", "a"], ).drop_vars("d"), }, coords={"c": np.linspace(0, 1, 5)}, ) assert_identical(actual, expected) def test_expand_dims_kwargs_python36plus(self) -> None: original = Dataset( {"x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3))}, coords={ "a": np.linspace(0, 1, 3), "b": np.linspace(0, 1, 4), "c": np.linspace(0, 1, 5), }, attrs={"key": "entry"}, ) other_way = original.expand_dims(e=["l", "m", "n"]) other_way_expected = Dataset( { "x": xr.DataArray( original["x"].values * np.ones([3, 3]), coords=dict(e=["l", "m", "n"], a=np.linspace(0, 1, 3)), dims=["e", "a"], ), "y": xr.DataArray( original["y"].values * np.ones([3, 4, 3]), coords=dict( e=["l", "m", "n"], b=np.linspace(0, 1, 4), a=np.linspace(0, 1, 3), ), dims=["e", "b", "a"], ), }, coords={"c": np.linspace(0, 1, 5)}, attrs={"key": "entry"}, ) assert_identical(other_way_expected, other_way) @pytest.mark.parametrize("create_index_for_new_dim_flag", [True, False]) def test_expand_dims_create_index_data_variable( self, create_index_for_new_dim_flag ): # data variables should not gain an index ever ds = Dataset({"x": 0}) if create_index_for_new_dim_flag: with pytest.warns(UserWarning, match="No index created"): expanded = ds.expand_dims( "x", create_index_for_new_dim=create_index_for_new_dim_flag ) else: expanded = ds.expand_dims( "x", create_index_for_new_dim=create_index_for_new_dim_flag ) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") assert_identical(expanded, expected, check_default_indexes=False) assert expanded.indexes == {} def test_expand_dims_create_index_coordinate_variable(self): # coordinate variables should gain an index only if create_index_for_new_dim is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) expanded_no_index = ds.expand_dims("x", create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") assert_identical(expanded_no_index, expected, check_default_indexes=False) assert expanded_no_index.indexes == {} def test_expand_dims_create_index_from_iterable(self): ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims(x=[0, 1]) expected = Dataset({"x": ("x", [0, 1])}) assert_identical(expanded, expected) expanded_no_index = ds.expand_dims(x=[0, 1], create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") assert_identical(expanded, expected, check_default_indexes=False) assert expanded_no_index.indexes == {} def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 # todo: test still needed? ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "m")]}) assert ds.time.dtype == np.dtype("datetime64[s]") def test_set_index(self) -> None: expected = create_test_multiindex() mindex = expected["x"].to_index() indexes = [mindex.get_level_values(n) for n in mindex.names] coords = {idx.name: ("x", idx) for idx in indexes} ds = Dataset({}, coords=coords) obj = ds.set_index(x=mindex.names) assert_identical(obj, expected) # ensure pre-existing indexes involved are removed # (level_2 should be a coordinate with no index) ds = create_test_multiindex() coords = {"x": coords["level_1"], "level_2": coords["level_2"]} expected = Dataset({}, coords=coords) obj = ds.set_index(x="level_1") assert_identical(obj, expected) # ensure set_index with no existing index and a single data var given # doesn't return multi-index ds = Dataset(data_vars={"x_var": ("x", [0, 1, 2])}) expected = Dataset(coords={"x": [0, 1, 2]}) assert_identical(ds.set_index(x="x_var"), expected) with pytest.raises(ValueError, match=r"bar variable\(s\) do not exist"): ds.set_index(foo="bar") with pytest.raises(ValueError, match=r"dimension mismatch.*"): ds.set_index(y="x_var") ds = Dataset(coords={"x": 1}) with pytest.raises( ValueError, match=r".*cannot set a PandasIndex.*scalar variable.*" ): ds.set_index(x="x") def test_set_index_deindexed_coords(self) -> None: # test de-indexed coordinates are converted to base variable # https://github.com/pydata/xarray/issues/6969 one = ["a", "a", "b", "b"] two = [1, 2, 1, 2] three = ["c", "c", "d", "d"] four = [3, 4, 3, 4] midx_12 = pd.MultiIndex.from_arrays([one, two], names=["one", "two"]) midx_34 = pd.MultiIndex.from_arrays([three, four], names=["three", "four"]) coords = Coordinates.from_pandas_multiindex(midx_12, "x") coords["three"] = ("x", three) coords["four"] = ("x", four) ds = xr.Dataset(coords=coords) actual = ds.set_index(x=["three", "four"]) coords_expected = Coordinates.from_pandas_multiindex(midx_34, "x") coords_expected["one"] = ("x", one) coords_expected["two"] = ("x", two) expected = xr.Dataset(coords=coords_expected) assert_identical(actual, expected) def test_reset_index(self) -> None: ds = create_test_multiindex() mindex = ds["x"].to_index() indexes = [mindex.get_level_values(n) for n in mindex.names] coords = {idx.name: ("x", idx) for idx in indexes} expected = Dataset({}, coords=coords) obj = ds.reset_index("x") assert_identical(obj, expected, check_default_indexes=False) assert len(obj.xindexes) == 0 ds = Dataset(coords={"y": ("x", [1, 2, 3])}) with pytest.raises(ValueError, match=r".*not coordinates with an index"): ds.reset_index("y") def test_reset_index_keep_attrs(self) -> None: coord_1 = DataArray([1, 2], dims=["coord_1"], attrs={"attrs": True}) ds = Dataset({}, {"coord_1": coord_1}) obj = ds.reset_index("coord_1") assert ds.coord_1.attrs == obj.coord_1.attrs assert len(obj.xindexes) == 0 def test_reset_index_drop_dims(self) -> None: ds = Dataset(coords={"x": [1, 2]}) reset = ds.reset_index("x", drop=True) assert len(reset.dims) == 0 @pytest.mark.parametrize( ["arg", "drop", "dropped", "converted", "renamed"], [ ("foo", False, [], [], {"bar": "x"}), ("foo", True, ["foo"], [], {"bar": "x"}), ("x", False, ["x"], ["foo", "bar"], {}), ("x", True, ["x", "foo", "bar"], [], {}), (["foo", "bar"], False, ["x"], ["foo", "bar"], {}), (["foo", "bar"], True, ["x", "foo", "bar"], [], {}), (["x", "foo"], False, ["x"], ["foo", "bar"], {}), (["foo", "x"], True, ["x", "foo", "bar"], [], {}), ], ) def test_reset_index_drop_convert( self, arg: str | list[str], drop: bool, dropped: list[str], converted: list[str], renamed: dict[str, str], ) -> None: # regressions https://github.com/pydata/xarray/issues/6946 and # https://github.com/pydata/xarray/issues/6989 # check that multi-index dimension or level coordinates are dropped, converted # from IndexVariable to Variable or renamed to dimension as expected midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("foo", "bar")) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") ds = xr.Dataset(coords=midx_coords) reset = ds.reset_index(arg, drop=drop) for name in dropped: assert name not in reset.variables for name in converted: assert_identical(reset[name].variable, ds[name].variable.to_base_variable()) for old_name, new_name in renamed.items(): assert_identical(ds[old_name].variable, reset[new_name].variable) def test_reorder_levels(self) -> None: ds = create_test_multiindex() mindex = ds["x"].to_index() assert isinstance(mindex, pd.MultiIndex) midx = mindex.reorder_levels(["level_2", "level_1"]) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") expected = Dataset({}, coords=midx_coords) # check attrs propagated ds["level_1"].attrs["foo"] = "bar" expected["level_1"].attrs["foo"] = "bar" reindexed = ds.reorder_levels(x=["level_2", "level_1"]) assert_identical(reindexed, expected) ds = Dataset({}, coords={"x": [1, 2]}) with pytest.raises(ValueError, match=r"has no MultiIndex"): ds.reorder_levels(x=["level_1", "level_2"]) def test_set_xindex(self) -> None: ds = Dataset( coords={"foo": ("x", ["a", "a", "b", "b"]), "bar": ("x", [0, 1, 2, 3])} ) actual = ds.set_xindex("foo") expected = ds.set_index(x="foo").rename_vars(x="foo") assert_identical(actual, expected, check_default_indexes=False) actual_mindex = ds.set_xindex(["foo", "bar"]) expected_mindex = ds.set_index(x=["foo", "bar"]) assert_identical(actual_mindex, expected_mindex) class NotAnIndex: ... with pytest.raises(TypeError, match=".*not a subclass of xarray.Index"): ds.set_xindex("foo", NotAnIndex) # type: ignore[arg-type] with pytest.raises(ValueError, match="those variables don't exist"): ds.set_xindex("not_a_coordinate", PandasIndex) ds["data_var"] = ("x", [1, 2, 3, 4]) with pytest.raises(ValueError, match="those variables are data variables"): ds.set_xindex("data_var", PandasIndex) ds2 = Dataset(coords={"x": ("x", [0, 1, 2, 3])}) with pytest.raises(ValueError, match="those coordinates already have an index"): ds2.set_xindex("x", PandasIndex) def test_set_xindex_options(self) -> None: ds = Dataset(coords={"foo": ("x", ["a", "a", "b", "b"])}) class IndexWithOptions(Index): def __init__(self, opt): self.opt = opt @classmethod def from_variables(cls, variables, options): return cls(options["opt"]) indexed = ds.set_xindex("foo", IndexWithOptions, opt=1) assert indexed.xindexes["foo"].opt == 1 # type: ignore[attr-defined] def test_stack(self) -> None: ds = Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3]])}, coords={"x": ("x", [0, 1]), "y": ["a", "b"]}, ) midx_expected = pd.MultiIndex.from_product( [[0, 1], ["a", "b"]], names=["x", "y"] ) midx_coords_expected = Coordinates.from_pandas_multiindex(midx_expected, "z") expected = Dataset( data_vars={"b": ("z", [0, 1, 2, 3])}, coords=midx_coords_expected ) # check attrs propagated ds["x"].attrs["foo"] = "bar" expected["x"].attrs["foo"] = "bar" actual = ds.stack(z=["x", "y"]) assert_identical(expected, actual) assert list(actual.xindexes) == ["z", "x", "y"] actual = ds.stack(z=[...]) assert_identical(expected, actual) # non list dims with ellipsis actual = ds.stack(z=(...,)) assert_identical(expected, actual) # ellipsis with given dim actual = ds.stack(z=[..., "y"]) assert_identical(expected, actual) midx_expected = pd.MultiIndex.from_product( [["a", "b"], [0, 1]], names=["y", "x"] ) midx_coords_expected = Coordinates.from_pandas_multiindex(midx_expected, "z") expected = Dataset( data_vars={"b": ("z", [0, 2, 1, 3])}, coords=midx_coords_expected ) expected["x"].attrs["foo"] = "bar" actual = ds.stack(z=["y", "x"]) assert_identical(expected, actual) assert list(actual.xindexes) == ["z", "y", "x"] @pytest.mark.parametrize( "create_index,expected_keys", [ (True, ["z", "x", "y"]), (False, []), (None, ["z", "x", "y"]), ], ) def test_stack_create_index(self, create_index, expected_keys) -> None: ds = Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3]])}, coords={"x": ("x", [0, 1]), "y": ["a", "b"]}, ) actual = ds.stack(z=["x", "y"], create_index=create_index) assert list(actual.xindexes) == expected_keys # TODO: benbovy (flexible indexes) - test error multiple indexes found # along dimension + create_index=True def test_stack_multi_index(self) -> None: # multi-index on a dimension to stack is discarded too midx = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=("lvl1", "lvl2")) coords = Coordinates.from_pandas_multiindex(midx, "x") coords["y"] = [0, 1] ds = xr.Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3], [4, 5], [6, 7]])}, coords=coords, ) expected = Dataset( data_vars={"b": ("z", [0, 1, 2, 3, 4, 5, 6, 7])}, coords={ "x": ("z", np.repeat(midx.values, 2)), "lvl1": ("z", np.repeat(midx.get_level_values("lvl1"), 2)), "lvl2": ("z", np.repeat(midx.get_level_values("lvl2"), 2)), "y": ("z", [0, 1, 0, 1] * 2), }, ) actual = ds.stack(z=["x", "y"], create_index=False) assert_identical(expected, actual) assert len(actual.xindexes) == 0 with pytest.raises(ValueError, match=r"cannot create.*wraps a multi-index"): ds.stack(z=["x", "y"], create_index=True) def test_stack_non_dim_coords(self) -> None: ds = Dataset( data_vars={"b": (("x", "y"), [[0, 1], [2, 3]])}, coords={"x": ("x", [0, 1]), "y": ["a", "b"]}, ).rename_vars(x="xx") exp_index = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["xx", "y"]) exp_coords = Coordinates.from_pandas_multiindex(exp_index, "z") expected = Dataset(data_vars={"b": ("z", [0, 1, 2, 3])}, coords=exp_coords) actual = ds.stack(z=["x", "y"]) assert_identical(expected, actual) assert list(actual.xindexes) == ["z", "xx", "y"] def test_unstack(self) -> None: index = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["x", "y"]) coords = Coordinates.from_pandas_multiindex(index, "z") ds = Dataset(data_vars={"b": ("z", [0, 1, 2, 3])}, coords=coords) expected = Dataset( {"b": (("x", "y"), [[0, 1], [2, 3]]), "x": [0, 1], "y": ["a", "b"]} ) # check attrs propagated ds["x"].attrs["foo"] = "bar" expected["x"].attrs["foo"] = "bar" for dim in ["z", ["z"], None]: actual = ds.unstack(dim) assert_identical(actual, expected) def test_unstack_errors(self) -> None: ds = Dataset({"x": [1, 2, 3]}) with pytest.raises( ValueError, match=re.escape("Dimensions ('foo',) not found in data dimensions ('x',)"), ): ds.unstack("foo") with pytest.raises(ValueError, match=r".*do not have exactly one multi-index"): ds.unstack("x") ds = Dataset({"da": [1, 2]}, coords={"y": ("x", [1, 1]), "z": ("x", [0, 0])}) ds = ds.set_index(x=("y", "z")) with pytest.raises( ValueError, match="Cannot unstack MultiIndex containing duplicates" ): ds.unstack("x") def test_unstack_fill_value(self) -> None: ds = xr.Dataset( {"var": (("x",), np.arange(6)), "other_var": (("x",), np.arange(3, 9))}, coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, ) # make ds incomplete ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) # test fill_value actual1 = ds.unstack("index", fill_value=-1) expected1 = ds.unstack("index").fillna(-1).astype(int) assert actual1["var"].dtype == int assert_equal(actual1, expected1) actual2 = ds["var"].unstack("index", fill_value=-1) expected2 = ds["var"].unstack("index").fillna(-1).astype(int) assert_equal(actual2, expected2) actual3 = ds.unstack("index", fill_value={"var": -1, "other_var": 1}) expected3 = ds.unstack("index").fillna({"var": -1, "other_var": 1}).astype(int) assert_equal(actual3, expected3) @requires_sparse def test_unstack_sparse(self) -> None: ds = xr.Dataset( {"var": (("x",), np.arange(6))}, coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, ) # make ds incomplete ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) # test fill_value actual1 = ds.unstack("index", sparse=True) expected1 = ds.unstack("index") assert isinstance(actual1["var"].data, sparse_array_type) assert actual1["var"].variable._to_dense().equals(expected1["var"].variable) assert actual1["var"].data.density < 1.0 actual2 = ds["var"].unstack("index", sparse=True) expected2 = ds["var"].unstack("index") assert isinstance(actual2.data, sparse_array_type) assert actual2.variable._to_dense().equals(expected2.variable) assert actual2.data.density < 1.0 midx = pd.MultiIndex.from_arrays([np.arange(3), np.arange(3)], names=["a", "b"]) coords = Coordinates.from_pandas_multiindex(midx, "z") coords["foo"] = np.arange(4) coords["bar"] = np.arange(5) ds_eye = Dataset( {"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))}, coords=coords ) actual3 = ds_eye.unstack(sparse=True, fill_value=0) assert isinstance(actual3["var"].data, sparse_array_type) expected3 = xr.Dataset( { "var": ( ("foo", "bar", "a", "b"), np.broadcast_to(np.eye(3, 3), (4, 5, 3, 3)), ) }, coords={ "foo": np.arange(4), "bar": np.arange(5), "a": np.arange(3), "b": np.arange(3), }, ) actual3["var"].data = actual3["var"].data.todense() assert_equal(expected3, actual3) def test_stack_unstack_fast(self) -> None: ds = Dataset( { "a": ("x", [0, 1]), "b": (("x", "y"), [[0, 1], [2, 3]]), "x": [0, 1], "y": ["a", "b"], } ) actual = ds.stack(z=["x", "y"]).unstack("z") assert actual.broadcast_equals(ds) actual = ds[["b"]].stack(z=["x", "y"]).unstack("z") assert actual.identical(ds[["b"]]) def test_stack_unstack_slow(self) -> None: ds = Dataset( data_vars={ "a": ("x", [0, 1]), "b": (("x", "y"), [[0, 1], [2, 3]]), }, coords={"x": [0, 1], "y": ["a", "b"]}, ) stacked = ds.stack(z=["x", "y"]) actual = stacked.isel(z=slice(None, None, -1)).unstack("z") assert actual.broadcast_equals(ds) stacked = ds[["b"]].stack(z=["x", "y"]) actual = stacked.isel(z=slice(None, None, -1)).unstack("z") assert actual.identical(ds[["b"]]) def test_to_stacked_array_invalid_sample_dims(self) -> None: data = xr.Dataset( data_vars={"a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), "b": ("x", [6, 7])}, coords={"y": ["u", "v", "w"]}, ) with pytest.raises( ValueError, match=r"Variables in the dataset must contain all ``sample_dims`` \(\['y'\]\) but 'b' misses \['y'\]", ): data.to_stacked_array("features", sample_dims=["y"]) def test_to_stacked_array_name(self) -> None: name = "adf9d" # make a two dimensional dataset a, b = create_test_stacked_array() D = xr.Dataset({"a": a, "b": b}) sample_dims = ["x"] y = D.to_stacked_array("features", sample_dims, name=name) assert y.name == name def test_to_stacked_array_dtype_dims(self) -> None: # make a two dimensional dataset a, b = create_test_stacked_array() D = xr.Dataset({"a": a, "b": b}) sample_dims = ["x"] y = D.to_stacked_array("features", sample_dims) mindex = y.xindexes["features"].to_pandas_index() assert isinstance(mindex, pd.MultiIndex) assert mindex.levels[1].dtype == D.y.dtype assert y.dims == ("x", "features") def test_to_stacked_array_to_unstacked_dataset(self) -> None: # single dimension: regression test for GH4049 arr = xr.DataArray(np.arange(3), coords=[("x", [0, 1, 2])]) data = xr.Dataset({"a": arr, "b": arr}) stacked = data.to_stacked_array("y", sample_dims=["x"]) unstacked = stacked.to_unstacked_dataset("y") assert_identical(unstacked, data) # make a two dimensional dataset a, b = create_test_stacked_array() D = xr.Dataset({"a": a, "b": b}) sample_dims = ["x"] y = D.to_stacked_array("features", sample_dims).transpose("x", "features") x = y.to_unstacked_dataset("features") assert_identical(D, x) # test on just one sample x0 = y[0].to_unstacked_dataset("features") d0 = D.isel(x=0) assert_identical(d0, x0) def test_to_stacked_array_to_unstacked_dataset_different_dimension(self) -> None: # test when variables have different dimensionality a, b = create_test_stacked_array() sample_dims = ["x"] D = xr.Dataset({"a": a, "b": b.isel(y=0)}) y = D.to_stacked_array("features", sample_dims) x = y.to_unstacked_dataset("features") assert_identical(D, x) def test_to_stacked_array_preserves_dtype(self) -> None: # regression test for bug found in https://github.com/pydata/xarray/pull/8872#issuecomment-2081218616 ds = xr.Dataset( data_vars={ "a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), "b": ("x", [6, 7]), }, coords={"y": ["u", "v", "w"]}, ) stacked = ds.to_stacked_array("z", sample_dims=["x"]) # coordinate created from variables names should be of string dtype data = np.array(["a", "a", "a", "b"], dtype=" None: data = create_test_data(seed=0) expected = data.copy() var2 = Variable("dim1", np.arange(8)) actual = data actual.update({"var2": var2}) expected["var2"] = var2 assert_identical(expected, actual) actual = data.copy() actual.update(data) assert_identical(expected, actual) other = Dataset(attrs={"new": "attr"}) actual = data.copy() actual.update(other) assert_identical(expected, actual) def test_update_overwrite_coords(self) -> None: data = Dataset({"a": ("x", [1, 2])}, {"b": 3}) data.update(Dataset(coords={"b": 4})) expected = Dataset({"a": ("x", [1, 2])}, {"b": 4}) assert_identical(data, expected) data = Dataset({"a": ("x", [1, 2])}, {"b": 3}) data.update(Dataset({"c": 5}, coords={"b": 4})) expected = Dataset({"a": ("x", [1, 2]), "c": 5}, {"b": 4}) assert_identical(data, expected) data = Dataset({"a": ("x", [1, 2])}, {"b": 3}) data.update({"c": DataArray(5, coords={"b": 4})}) expected = Dataset({"a": ("x", [1, 2]), "c": 5}, {"b": 3}) assert_identical(data, expected) def test_update_multiindex_level(self) -> None: data = create_test_multiindex() with pytest.raises( ValueError, match=r"cannot set or update variable.*corrupt.*index " ): data.update({"level_1": range(4)}) def test_update_auto_align(self) -> None: ds = Dataset({"x": ("t", [3, 4])}, {"t": [0, 1]}) expected1 = Dataset( {"x": ("t", [3, 4]), "y": ("t", [np.nan, 5])}, {"t": [0, 1]} ) actual1 = ds.copy() other1 = {"y": ("t", [5]), "t": [1]} with pytest.raises(ValueError, match=r"conflicting sizes"): actual1.update(other1) actual1.update(Dataset(other1)) assert_identical(expected1, actual1) actual2 = ds.copy() other2 = Dataset({"y": ("t", [5]), "t": [100]}) actual2.update(other2) expected2 = Dataset( {"x": ("t", [3, 4]), "y": ("t", [np.nan] * 2)}, {"t": [0, 1]} ) assert_identical(expected2, actual2) def test_getitem(self) -> None: data = create_test_data() assert isinstance(data["var1"], DataArray) assert_equal(data["var1"].variable, data.variables["var1"]) with pytest.raises(KeyError): data["notfound"] with pytest.raises(KeyError): data[["var1", "notfound"]] with pytest.raises( KeyError, match=r"Hint: use a list to select multiple variables, for example `ds\[\['var1', 'var2'\]\]`", ): data["var1", "var2"] actual1 = data[["var1", "var2"]] expected1 = Dataset({"var1": data["var1"], "var2": data["var2"]}) assert_equal(expected1, actual1) actual2 = data["numbers"] expected2 = DataArray( data["numbers"].variable, {"dim3": data["dim3"], "numbers": data["numbers"]}, dims="dim3", name="numbers", ) assert_identical(expected2, actual2) actual3 = data[dict(dim1=0)] expected3 = data.isel(dim1=0) assert_identical(expected3, actual3) def test_getitem_hashable(self) -> None: data = create_test_data() data[(3, 4)] = data["var1"] + 1 expected = data["var1"] + 1 expected.name = (3, 4) assert_identical(expected, data[(3, 4)]) with pytest.raises(KeyError, match=r"('var1', 'var2')"): data[("var1", "var2")] def test_getitem_multiple_dtype(self) -> None: keys = ["foo", 1] dataset = Dataset({key: ("dim0", range(1)) for key in keys}) assert_identical(dataset, dataset[keys]) def test_virtual_variables_default_coords(self) -> None: dataset = Dataset({"foo": ("x", range(10))}) expected1 = DataArray(range(10), dims="x", name="x") actual1 = dataset["x"] assert_identical(expected1, actual1) assert isinstance(actual1.variable, IndexVariable) actual2 = dataset[["x", "foo"]] expected2 = dataset.assign_coords(x=range(10)) assert_identical(expected2, actual2) def test_virtual_variables_time(self) -> None: # access virtual variables data = create_test_data() index = data.variables["time"].to_index() assert isinstance(index, pd.DatetimeIndex) assert_array_equal(data["time.month"].values, index.month) assert_array_equal(data["time.season"].values, "DJF") # test virtual variable math assert_array_equal(data["time.dayofyear"] + 1, 2 + np.arange(20)) assert_array_equal(np.sin(data["time.dayofyear"]), np.sin(1 + np.arange(20))) # ensure they become coordinates expected = Dataset({}, {"dayofyear": data["time.dayofyear"]}) actual = data[["time.dayofyear"]] assert_equal(expected, actual) # non-coordinate variables ds = Dataset({"t": ("x", pd.date_range("2000-01-01", periods=3))}) assert (ds["t.year"] == 2000).all() def test_virtual_variable_same_name(self) -> None: # regression test for GH367 times = pd.date_range("2000-01-01", freq="h", periods=5) data = Dataset({"time": times}) actual = data["time.time"] expected = DataArray(times.time, [("time", times)], name="time") assert_identical(actual, expected) def test_time_season(self) -> None: time = xr.date_range("2000-01-01", periods=12, freq="ME", use_cftime=False) ds = Dataset({"t": time}) seas = ["DJF"] * 2 + ["MAM"] * 3 + ["JJA"] * 3 + ["SON"] * 3 + ["DJF"] assert_array_equal(seas, ds["t.season"]) def test_slice_virtual_variable(self) -> None: data = create_test_data() assert_equal( data["time.dayofyear"][:10].variable, Variable(["time"], 1 + np.arange(10)) ) assert_equal(data["time.dayofyear"][0].variable, Variable([], 1)) def test_setitem(self) -> None: # assign a variable var = Variable(["dim1"], np.random.randn(8)) data1 = create_test_data() data1["A"] = var data2 = data1.copy() data2["A"] = var assert_identical(data1, data2) # assign a dataset array dv = 2 * data2["A"] data1["B"] = dv.variable data2["B"] = dv assert_identical(data1, data2) # can't assign an ND array without dimensions with pytest.raises(ValueError, match=r"without explicit dimension names"): data2["C"] = var.values.reshape(2, 4) # but can assign a 1D array data1["C"] = var.values data2["C"] = ("C", var.values) assert_identical(data1, data2) # can assign a scalar data1["scalar"] = 0 data2["scalar"] = ([], 0) assert_identical(data1, data2) # can't use the same dimension name as a scalar var with pytest.raises(ValueError, match=r"already exists as a scalar"): data1["newvar"] = ("scalar", [3, 4, 5]) # can't resize a used dimension with pytest.raises(ValueError, match=r"conflicting dimension sizes"): data1["dim1"] = data1["dim1"][:5] # override an existing value data1["A"] = 3 * data2["A"] assert_equal(data1["A"], 3 * data2["A"]) # can't assign a dataset to a single key with pytest.raises(TypeError, match="Cannot assign a Dataset to a single key"): data1["D"] = xr.Dataset() # test assignment with positional and label-based indexing data3 = data1[["var1", "var2"]] data3["var3"] = data3.var1.isel(dim1=0) data4 = data3.copy() err_msg = ( "can only set locations defined by dictionaries from Dataset.loc. Got: a" ) with pytest.raises(TypeError, match=err_msg): data1.loc["a"] = 0 err_msg = r"Variables \['A', 'B', 'scalar'\] in new values not available in original dataset:" with pytest.raises(ValueError, match=err_msg): data4[{"dim2": 1}] = data1[{"dim2": 2}] err_msg = "Variable 'var3': indexer {'dim2': 0} not available" with pytest.raises(ValueError, match=err_msg): data1[{"dim2": 0}] = 0.0 err_msg = "Variable 'var1': indexer {'dim2': 10} not available" with pytest.raises(ValueError, match=err_msg): data4[{"dim2": 10}] = data3[{"dim2": 2}] err_msg = "Variable 'var1': dimension 'dim2' appears in new values" with pytest.raises(KeyError, match=err_msg): data4[{"dim2": 2}] = data3[{"dim2": [2]}] err_msg = ( "Variable 'var2': dimension order differs between original and new data" ) data3["var2"] = data3["var2"].T with pytest.raises(ValueError, match=err_msg): data4[{"dim2": [2, 3]}] = data3[{"dim2": [2, 3]}] data3["var2"] = data3["var2"].T err_msg = r"cannot align objects.*not equal along these coordinates.*" with pytest.raises(ValueError, match=err_msg): data4[{"dim2": [2, 3]}] = data3[{"dim2": [2, 3, 4]}] err_msg = "Dataset assignment only accepts DataArrays, Datasets, and scalars." with pytest.raises(TypeError, match=err_msg): data4[{"dim2": [2, 3]}] = data3["var1"][{"dim2": [3, 4]}].values data5 = data4.astype(str) data5["var4"] = data4["var1"] # convert to `np.str_('a')` once `numpy<2.0` has been dropped err_msg = "could not convert string to float: .*'a'.*" with pytest.raises(ValueError, match=err_msg): data5[{"dim2": 1}] = "a" data4[{"dim2": 0}] = 0.0 data4[{"dim2": 1}] = data3[{"dim2": 2}] data4.loc[{"dim2": 1.5}] = 1.0 data4.loc[{"dim2": 2.0}] = data3.loc[{"dim2": 2.5}] for v, dat3 in data3.items(): dat4 = data4[v] assert_array_equal(dat4[{"dim2": 0}], 0.0) assert_array_equal(dat4[{"dim2": 1}], dat3[{"dim2": 2}]) assert_array_equal(dat4.loc[{"dim2": 1.5}], 1.0) assert_array_equal(dat4.loc[{"dim2": 2.0}], dat3.loc[{"dim2": 2.5}]) unchanged = [1.0, 2.5, 3.0, 3.5, 4.0] assert_identical( dat4.loc[{"dim2": unchanged}], dat3.loc[{"dim2": unchanged}] ) def test_setitem_pandas(self) -> None: ds = self.make_example_math_dataset() ds["x"] = np.arange(3) ds_copy = ds.copy() ds_copy["bar"] = ds["bar"].to_pandas() assert_equal(ds, ds_copy) def test_setitem_auto_align(self) -> None: ds = Dataset() ds["x"] = ("y", range(3)) ds["y"] = 1 + np.arange(3) expected = Dataset({"x": ("y", range(3)), "y": 1 + np.arange(3)}) assert_identical(ds, expected) ds["y"] = DataArray(range(3), dims="y") expected = Dataset({"x": ("y", range(3))}, {"y": range(3)}) assert_identical(ds, expected) ds["x"] = DataArray([1, 2], coords=[("y", [0, 1])]) expected = Dataset({"x": ("y", [1, 2, np.nan])}, {"y": range(3)}) assert_identical(ds, expected) ds["x"] = 42 expected = Dataset({"x": 42, "y": range(3)}) assert_identical(ds, expected) ds["x"] = DataArray([4, 5, 6, 7], coords=[("y", [0, 1, 2, 3])]) expected = Dataset({"x": ("y", [4, 5, 6])}, {"y": range(3)}) assert_identical(ds, expected) def test_setitem_dimension_override(self) -> None: # regression test for GH-3377 ds = xr.Dataset({"x": [0, 1, 2]}) ds["x"] = ds["x"][:2] expected = Dataset({"x": [0, 1]}) assert_identical(ds, expected) ds = xr.Dataset({"x": [0, 1, 2]}) ds["x"] = np.array([0, 1]) assert_identical(ds, expected) ds = xr.Dataset({"x": [0, 1, 2]}) ds.coords["x"] = [0, 1] assert_identical(ds, expected) def test_setitem_with_coords(self) -> None: # Regression test for GH:2068 ds = create_test_data() other = DataArray( np.arange(10), dims="dim3", coords={"numbers": ("dim3", np.arange(10))} ) expected = ds.copy() expected["var3"] = other.drop_vars("numbers") actual = ds.copy() actual["var3"] = other assert_identical(expected, actual) assert "numbers" in other.coords # should not change other # with alignment other = ds["var3"].isel(dim3=slice(1, -1)) other["numbers"] = ("dim3", np.arange(8)) actual = ds.copy() actual["var3"] = other assert "numbers" in other.coords # should not change other expected = ds.copy() expected["var3"] = ds["var3"].isel(dim3=slice(1, -1)) assert_identical(expected, actual) # with non-duplicate coords other = ds["var3"].isel(dim3=slice(1, -1)) other["numbers"] = ("dim3", np.arange(8)) other["position"] = ("dim3", np.arange(8)) actual = ds.copy() actual["var3"] = other assert "position" in actual assert "position" in other.coords # assigning a coordinate-only dataarray actual = ds.copy() other = actual["numbers"] other[0] = 10 actual["numbers"] = other assert actual["numbers"][0] == 10 # GH: 2099 ds = Dataset( {"var": ("x", [1, 2, 3])}, coords={"x": [0, 1, 2], "z1": ("x", [1, 2, 3]), "z2": ("x", [1, 2, 3])}, ) ds["var"] = ds["var"] * 2 assert np.allclose(ds["var"], [2, 4, 6]) def test_setitem_align_new_indexes(self) -> None: ds = Dataset({"foo": ("x", [1, 2, 3])}, {"x": [0, 1, 2]}) ds["bar"] = DataArray([2, 3, 4], [("x", [1, 2, 3])]) expected = Dataset( {"foo": ("x", [1, 2, 3]), "bar": ("x", [np.nan, 2, 3])}, {"x": [0, 1, 2]} ) assert_identical(ds, expected) def test_setitem_vectorized(self) -> None: # Regression test for GH:7030 # Positional indexing da = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"]) ds = xr.Dataset({"da": da}) b = xr.DataArray([[0, 0], [1, 0]], dims=["u", "v"]) c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"]) w = xr.DataArray([-1, -2], dims=["u"]) index = dict(b=b, c=c) ds[index] = xr.Dataset({"da": w}) assert (ds[index]["da"] == w).all() # Indexing with coordinates da = xr.DataArray(np.r_[:120].reshape(2, 3, 4, 5), dims=["a", "b", "c", "d"]) ds = xr.Dataset({"da": da}) ds.coords["b"] = [2, 4, 6] b = xr.DataArray([[2, 2], [4, 2]], dims=["u", "v"]) c = xr.DataArray([[0, 1], [2, 3]], dims=["u", "v"]) w = xr.DataArray([-1, -2], dims=["u"]) index = dict(b=b, c=c) ds.loc[index] = xr.Dataset({"da": w}, coords={"b": ds.coords["b"]}) assert (ds.loc[index]["da"] == w).all() @pytest.mark.parametrize("dtype", [str, bytes]) def test_setitem_str_dtype(self, dtype) -> None: ds = xr.Dataset(coords={"x": np.array(["x", "y"], dtype=dtype)}) # test Dataset update ds["foo"] = xr.DataArray(np.array([0, 0]), dims=["x"]) assert np.issubdtype(ds.x.dtype, dtype) def test_setitem_using_list(self) -> None: # assign a list of variables var1 = Variable(["dim1"], np.random.randn(8)) var2 = Variable(["dim1"], np.random.randn(8)) actual = create_test_data() expected = actual.copy() expected["A"] = var1 expected["B"] = var2 actual[["A", "B"]] = [var1, var2] assert_identical(actual, expected) # assign a list of dataset arrays dv = 2 * expected[["A", "B"]] actual[["C", "D"]] = [d.variable for d in dv.data_vars.values()] expected[["C", "D"]] = dv assert_identical(actual, expected) @pytest.mark.parametrize( "var_list, data, error_regex", [ ( ["A", "B"], [Variable(["dim1"], np.random.randn(8))], r"Different lengths", ), ([], [Variable(["dim1"], np.random.randn(8))], r"Empty list of variables"), (["A", "B"], xr.DataArray([1, 2]), r"assign single DataArray"), ], ) def test_setitem_using_list_errors(self, var_list, data, error_regex) -> None: actual = create_test_data() with pytest.raises(ValueError, match=error_regex): actual[var_list] = data def test_assign(self) -> None: ds = Dataset() actual = ds.assign(x=[0, 1, 2], y=2) expected = Dataset({"x": [0, 1, 2], "y": 2}) assert_identical(actual, expected) assert list(actual.variables) == ["x", "y"] assert_identical(ds, Dataset()) actual = actual.assign(y=lambda ds: ds.x**2) expected = Dataset({"y": ("x", [0, 1, 4]), "x": [0, 1, 2]}) assert_identical(actual, expected) actual = actual.assign_coords(z=2) expected = Dataset({"y": ("x", [0, 1, 4])}, {"z": 2, "x": [0, 1, 2]}) assert_identical(actual, expected) def test_assign_coords(self) -> None: ds = Dataset() actual = ds.assign(x=[0, 1, 2], y=2) actual = actual.assign_coords(x=list("abc")) expected = Dataset({"x": list("abc"), "y": 2}) assert_identical(actual, expected) actual = ds.assign(x=[0, 1, 2], y=[2, 3]) actual = actual.assign_coords({"y": [2.0, 3.0]}) expected = ds.assign(x=[0, 1, 2], y=[2.0, 3.0]) assert_identical(actual, expected) def test_assign_attrs(self) -> None: expected = Dataset(attrs=dict(a=1, b=2)) new = Dataset() actual = new.assign_attrs(a=1, b=2) assert_identical(actual, expected) assert new.attrs == {} expected.attrs["c"] = 3 new_actual = actual.assign_attrs({"c": 3}) assert_identical(new_actual, expected) assert actual.attrs == dict(a=1, b=2) def test_drop_attrs(self) -> None: # Simple example ds = Dataset().assign_attrs(a=1, b=2) original = ds.copy() expected = Dataset() result = ds.drop_attrs() assert_identical(result, expected) # Doesn't change original assert_identical(ds, original) # Example with variables and coords with attrs, and a multiindex. (arguably # should have used a canonical dataset with all the features we're should # support...) var = Variable("x", [1, 2, 3], attrs=dict(x=1, y=2)) idx = IndexVariable("y", [1, 2, 3], attrs=dict(c=1, d=2)) mx = xr.Coordinates.from_pandas_multiindex( pd.MultiIndex.from_tuples([(1, 2), (3, 4)], names=["d", "e"]), "z" ) ds = Dataset(dict(var1=var), coords=dict(y=idx, z=mx)).assign_attrs(a=1, b=2) assert ds.attrs != {} assert ds["var1"].attrs != {} assert ds["y"].attrs != {} assert ds.coords["y"].attrs != {} original = ds.copy(deep=True) result = ds.drop_attrs() assert result.attrs == {} assert result["var1"].attrs == {} assert result["y"].attrs == {} assert list(result.data_vars) == list(ds.data_vars) assert list(result.coords) == list(ds.coords) # Doesn't change original assert_identical(ds, original) # Specifically test that the attrs on the coords are still there. (The index # can't currently contain `attrs`, so we can't test those.) assert ds.coords["y"].attrs != {} # Test for deep=False result_shallow = ds.drop_attrs(deep=False) assert result_shallow.attrs == {} assert result_shallow["var1"].attrs != {} assert result_shallow["y"].attrs != {} assert list(result.data_vars) == list(ds.data_vars) assert list(result.coords) == list(ds.coords) def test_assign_multiindex_level(self) -> None: data = create_test_multiindex() with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.assign(level_1=range(4)) data.assign_coords(level_1=range(4)) def test_assign_new_multiindex(self) -> None: midx = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [0, 1, 0, 1]]) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") ds = Dataset(coords={"x": [1, 2]}) expected = Dataset(coords=midx_coords) with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): actual = ds.assign(x=midx) assert_identical(actual, expected) @pytest.mark.parametrize("orig_coords", [{}, {"x": range(4)}]) def test_assign_coords_new_multiindex(self, orig_coords) -> None: ds = Dataset(coords=orig_coords) midx = pd.MultiIndex.from_arrays( [["a", "a", "b", "b"], [0, 1, 0, 1]], names=("one", "two") ) midx_coords = Coordinates.from_pandas_multiindex(midx, "x") expected = Dataset(coords=midx_coords) with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): actual = ds.assign_coords({"x": midx}) assert_identical(actual, expected) actual = ds.assign_coords(midx_coords) assert_identical(actual, expected) def test_assign_coords_existing_multiindex(self) -> None: data = create_test_multiindex() with pytest.warns( FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" ): updated = data.assign_coords(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) assert len(updated.coords) == 1 with pytest.warns( FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" ): updated = data.assign(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) assert len(updated.coords) == 1 def test_assign_all_multiindex_coords(self) -> None: data = create_test_multiindex() actual = data.assign(x=range(4), level_1=range(4), level_2=range(4)) # no error but multi-index dropped in favor of single indexes for each level assert ( actual.xindexes["x"] is not actual.xindexes["level_1"] is not actual.xindexes["level_2"] ) def test_assign_coords_custom_index_side_effect(self) -> None: # test that assigning new coordinates do not reset other dimension coord indexes # to default (pandas) index (https://github.com/pydata/xarray/issues/7346) class CustomIndex(PandasIndex): pass ds = ( Dataset(coords={"x": [1, 2, 3]}) .drop_indexes("x") .set_xindex("x", CustomIndex) ) actual = ds.assign_coords(y=[4, 5, 6]) assert isinstance(actual.xindexes["x"], CustomIndex) def test_assign_coords_custom_index(self) -> None: class CustomIndex(Index): pass coords = Coordinates( coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} ) ds = Dataset() actual = ds.assign_coords(coords) assert isinstance(actual.xindexes["x"], CustomIndex) def test_assign_coords_no_default_index(self) -> None: coords = Coordinates({"y": [1, 2, 3]}, indexes={}) ds = Dataset() actual = ds.assign_coords(coords) expected = coords.to_dataset() assert_identical(expected, actual, check_default_indexes=False) assert "y" not in actual.xindexes def test_merge_multiindex_level(self) -> None: data = create_test_multiindex() other = Dataset({"level_1": ("x", [0, 1])}) with pytest.raises(ValueError, match=r".*conflicting dimension sizes.*"): data.merge(other) other = Dataset({"level_1": ("x", range(4))}) with pytest.raises( ValueError, match=r"unable to determine.*coordinates or not.*" ): data.merge(other) # `other` Dataset coordinates are ignored (bug or feature?) other = Dataset(coords={"level_1": ("x", range(4))}) assert_identical(data.merge(other), data) def test_setitem_original_non_unique_index(self) -> None: # regression test for GH943 original = Dataset({"data": ("x", np.arange(5))}, coords={"x": [0, 1, 2, 0, 1]}) expected = Dataset({"data": ("x", np.arange(5))}, {"x": range(5)}) actual = original.copy() actual["x"] = list(range(5)) assert_identical(actual, expected) actual = original.copy() actual["x"] = ("x", list(range(5))) assert_identical(actual, expected) actual = original.copy() actual.coords["x"] = list(range(5)) assert_identical(actual, expected) def test_setitem_both_non_unique_index(self) -> None: # regression test for GH956 names = ["joaquin", "manolo", "joaquin"] values = np.random.randint(0, 256, (3, 4, 4)) array = DataArray( values, dims=["name", "row", "column"], coords=[names, range(4), range(4)] ) expected = Dataset({"first": array, "second": array}) actual = array.rename("first").to_dataset() actual["second"] = array assert_identical(expected, actual) def test_setitem_multiindex_level(self) -> None: data = create_test_multiindex() with pytest.raises( ValueError, match=r"cannot set or update variable.*corrupt.*index " ): data["level_1"] = range(4) def test_delitem(self) -> None: data = create_test_data() all_items = set(data.variables) assert set(data.variables) == all_items del data["var1"] assert set(data.variables) == all_items - {"var1"} del data["numbers"] assert set(data.variables) == all_items - {"var1", "numbers"} assert "numbers" not in data.coords expected = Dataset() actual = Dataset({"y": ("x", [1, 2])}) del actual["y"] assert_identical(expected, actual) def test_delitem_multiindex_level(self) -> None: data = create_test_multiindex() with pytest.raises( ValueError, match=r"cannot remove coordinate.*corrupt.*index " ): del data["level_1"] def test_squeeze(self) -> None: data = Dataset({"foo": (["x", "y", "z"], [[[1], [2]]])}) test_args: list[list] = [[], [["x"]], [["x", "z"]]] for args in test_args: def get_args(args, v): return [set(args[0]) & set(v.dims)] if args else [] expected = Dataset( {k: v.squeeze(*get_args(args, v)) for k, v in data.variables.items()} ) expected = expected.set_coords(data.coords) assert_identical(expected, data.squeeze(*args)) # invalid squeeze with pytest.raises(ValueError, match=r"cannot select a dimension"): data.squeeze("y") def test_squeeze_drop(self) -> None: data = Dataset({"foo": ("x", [1])}, {"x": [0]}) expected = Dataset({"foo": 1}) selected = data.squeeze(drop=True) assert_identical(expected, selected) expected = Dataset({"foo": 1}, {"x": 0}) selected = data.squeeze(drop=False) assert_identical(expected, selected) data = Dataset({"foo": (("x", "y"), [[1]])}, {"x": [0], "y": [0]}) expected = Dataset({"foo": 1}) selected = data.squeeze(drop=True) assert_identical(expected, selected) expected = Dataset({"foo": ("x", [1])}, {"x": [0]}) selected = data.squeeze(dim="y", drop=True) assert_identical(expected, selected) data = Dataset({"foo": (("x",), [])}, {"x": []}) selected = data.squeeze(drop=True) assert_identical(data, selected) def test_to_dataarray(self) -> None: ds = Dataset( {"a": 1, "b": ("x", [1, 2, 3])}, coords={"c": 42}, attrs={"Conventions": "None"}, ) data = [[1, 1, 1], [1, 2, 3]] coords = {"c": 42, "variable": ["a", "b"]} dims = ("variable", "x") expected = DataArray(data, coords, dims, attrs=ds.attrs) actual = ds.to_dataarray() assert_identical(expected, actual) actual = ds.to_dataarray("abc", name="foo") expected = expected.rename({"variable": "abc"}).rename("foo") assert_identical(expected, actual) def test_to_and_from_dataframe(self) -> None: x = np.random.randn(10) y = np.random.randn(10) t = list("abcdefghij") cat = pd.Categorical(["a", "b"] * 5) ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t), "cat": ("t", cat)}) expected = pd.DataFrame( np.array([x, y]).T, columns=["a", "b"], index=pd.Index(t, name="t") ) expected["cat"] = cat actual = ds.to_dataframe() # use the .equals method to check all DataFrame metadata assert expected.equals(actual), (expected, actual) # verify coords are included actual = ds.set_coords("b").to_dataframe() assert expected.equals(actual), (expected, actual) # check roundtrip assert_identical(ds, Dataset.from_dataframe(actual)) assert isinstance(ds["cat"].variable.data.dtype, pd.CategoricalDtype) # test a case with a MultiIndex w = np.random.randn(2, 3) cat = pd.Categorical(["a", "a", "c"]) ds = Dataset({"w": (("x", "y"), w), "cat": ("y", cat)}) ds["y"] = ("y", list("abc")) exp_index = pd.MultiIndex.from_arrays( [[0, 0, 0, 1, 1, 1], ["a", "b", "c", "a", "b", "c"]], names=["x", "y"] ) expected = pd.DataFrame( {"w": w.reshape(-1), "cat": pd.Categorical(["a", "a", "c", "a", "a", "c"])}, index=exp_index, ) actual = ds.to_dataframe() assert expected.equals(actual) # check roundtrip # from_dataframe attempts to broadcast across because it doesn't know better, so cat must be converted ds["cat"] = (("x", "y"), np.stack((ds["cat"].to_numpy(), ds["cat"].to_numpy()))) assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual)) # Check multiindex reordering new_order = ["x", "y"] # revert broadcasting fix above for 1d arrays ds["cat"] = ("y", cat) actual = ds.to_dataframe(dim_order=new_order) assert expected.equals(actual) new_order = ["y", "x"] exp_index = pd.MultiIndex.from_arrays( [["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], names=["y", "x"] ) expected = pd.DataFrame( { "w": w.transpose().reshape(-1), "cat": pd.Categorical(["a", "a", "a", "a", "c", "c"]), }, index=exp_index, ) actual = ds.to_dataframe(dim_order=new_order) assert expected.equals(actual) invalid_order = ["x"] with pytest.raises( ValueError, match="does not match the set of dimensions of this" ): ds.to_dataframe(dim_order=invalid_order) invalid_order = ["x", "z"] with pytest.raises( ValueError, match="does not match the set of dimensions of this" ): ds.to_dataframe(dim_order=invalid_order) # check pathological cases df = pd.DataFrame([1]) actual_ds = Dataset.from_dataframe(df) expected_ds = Dataset({0: ("index", [1])}, {"index": [0]}) assert_identical(expected_ds, actual_ds) df = pd.DataFrame() actual_ds = Dataset.from_dataframe(df) expected_ds = Dataset(coords={"index": []}) assert_identical(expected_ds, actual_ds) # GH697 df = pd.DataFrame({"A": []}) actual_ds = Dataset.from_dataframe(df) expected_ds = Dataset({"A": DataArray([], dims=("index",))}, {"index": []}) assert_identical(expected_ds, actual_ds) # regression test for GH278 # use int64 to ensure consistent results for the pandas .equals method # on windows (which requires the same dtype) ds = Dataset({"x": pd.Index(["bar"]), "a": ("y", np.array([1], "int64"))}).isel( x=0 ) # use .loc to ensure consistent results on Python 3 actual = ds.to_dataframe().loc[:, ["a", "x"]] expected = pd.DataFrame( [[1, "bar"]], index=pd.Index([0], name="y"), columns=["a", "x"] ) assert expected.equals(actual), (expected, actual) ds = Dataset({"x": np.array([0], "int64"), "y": np.array([1], "int64")}) actual = ds.to_dataframe() idx = pd.MultiIndex.from_arrays([[0], [1]], names=["x", "y"]) expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) def test_from_dataframe_categorical_index(self) -> None: cat = pd.CategoricalDtype( categories=["foo", "bar", "baz", "qux", "quux", "corge"] ) i1 = pd.Series(["foo", "bar", "foo"], dtype=cat) i2 = pd.Series(["bar", "bar", "baz"], dtype=cat) df = pd.DataFrame({"i1": i1, "i2": i2, "values": [1, 2, 3]}) ds = df.set_index("i1").to_xarray() assert len(ds["i1"]) == 3 ds = df.set_index(["i1", "i2"]).to_xarray() assert len(ds["i1"]) == 2 assert len(ds["i2"]) == 2 def test_from_dataframe_categorical_index_string_categories(self) -> None: cat = pd.CategoricalIndex( pd.Categorical.from_codes( np.array([1, 1, 0, 2], dtype=np.int64), # type: ignore[arg-type] categories=pd.Index(["foo", "bar", "baz"], dtype="string"), ) ) ser = pd.Series(1, index=cat) ds = ser.to_xarray() assert ds.coords.dtypes["index"] == np.dtype("O") @requires_sparse def test_from_dataframe_sparse(self) -> None: import sparse df_base = pd.DataFrame( {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)} ) ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True) ds_dense = Dataset.from_dataframe(df_base.set_index("x"), sparse=False) assert isinstance(ds_sparse["y"].data, sparse.COO) assert isinstance(ds_sparse["z"].data, sparse.COO) ds_sparse["y"].data = ds_sparse["y"].data.todense() ds_sparse["z"].data = ds_sparse["z"].data.todense() assert_identical(ds_dense, ds_sparse) ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True) ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False) assert isinstance(ds_sparse["z"].data, sparse.COO) ds_sparse["z"].data = ds_sparse["z"].data.todense() assert_identical(ds_dense, ds_sparse) def test_to_and_from_empty_dataframe(self) -> None: # GH697 expected = pd.DataFrame({"foo": []}) ds = Dataset.from_dataframe(expected) assert len(ds["foo"]) == 0 actual = ds.to_dataframe() assert len(actual) == 0 assert expected.equals(actual) def test_from_dataframe_multiindex(self) -> None: index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"]) df = pd.DataFrame({"z": np.arange(6)}, index=index) expected = Dataset( {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])}, coords={"x": ["a", "b"], "y": [1, 2, 3]}, ) actual = Dataset.from_dataframe(df) assert_identical(actual, expected) df2 = df.iloc[[3, 2, 1, 0, 4, 5], :] actual = Dataset.from_dataframe(df2) assert_identical(actual, expected) df3 = df.iloc[:4, :] expected3 = Dataset( {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])}, coords={"x": ["a", "b"], "y": [1, 2, 3]}, ) actual = Dataset.from_dataframe(df3) assert_identical(actual, expected3) df_nonunique = df.iloc[[0, 0], :] with pytest.raises(ValueError, match=r"non-unique MultiIndex"): Dataset.from_dataframe(df_nonunique) def test_from_dataframe_unsorted_levels(self) -> None: # regression test for GH-4186 index = pd.MultiIndex( levels=[["b", "a"], ["foo"]], codes=[[0, 1], [0, 0]], names=["lev1", "lev2"] ) df = pd.DataFrame({"c1": [0, 2], "c2": [1, 3]}, index=index) expected = Dataset( { "c1": (("lev1", "lev2"), [[0], [2]]), "c2": (("lev1", "lev2"), [[1], [3]]), }, coords={"lev1": ["b", "a"], "lev2": ["foo"]}, ) actual = Dataset.from_dataframe(df) assert_identical(actual, expected) def test_from_dataframe_non_unique_columns(self) -> None: # regression test for GH449 df = pd.DataFrame(np.zeros((2, 2))) df.columns = ["foo", "foo"] # type: ignore[assignment] with pytest.raises(ValueError, match=r"non-unique columns"): Dataset.from_dataframe(df) def test_convert_dataframe_with_many_types_and_multiindex(self) -> None: # regression test for GH737 df = pd.DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), "h": pd.date_range("20130101", periods=3, tz="America/New_York"), } ) df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) roundtripped = Dataset.from_dataframe(df).to_dataframe() # we can't do perfectly, but we should be at least as faithful as # np.asarray expected = df.apply(np.asarray) assert roundtripped.equals(expected) @pytest.mark.parametrize("encoding", [True, False]) @pytest.mark.parametrize("data", [True, "list", "array"]) def test_to_and_from_dict( self, encoding: bool, data: bool | Literal["list", "array"] ) -> None: # # Dimensions: (t: 10) # Coordinates: # * t (t) U1" expected_no_data["coords"]["t"].update({"dtype": endiantype, "shape": (10,)}) expected_no_data["data_vars"]["a"].update({"dtype": "float64", "shape": (10,)}) expected_no_data["data_vars"]["b"].update({"dtype": "float64", "shape": (10,)}) actual_no_data = ds.to_dict(data=False, encoding=encoding) assert expected_no_data == actual_no_data # verify coords are included roundtrip expected_ds = ds.set_coords("b") actual2 = Dataset.from_dict(expected_ds.to_dict(data=data, encoding=encoding)) assert_identical(expected_ds, actual2) if encoding: assert set(expected_ds.variables) == set(actual2.variables) for vv in ds.variables: np.testing.assert_equal(expected_ds[vv].encoding, actual2[vv].encoding) # test some incomplete dicts: # this one has no attrs field, the dims are strings, and x, y are # np.arrays d = { "coords": {"t": {"dims": "t", "data": t}}, "dims": "t", "data_vars": {"a": {"dims": "t", "data": x}, "b": {"dims": "t", "data": y}}, } assert_identical(ds, Dataset.from_dict(d)) # this is kind of a flattened version with no coords, or data_vars d = { "a": {"dims": "t", "data": x}, "t": {"data": t, "dims": "t"}, "b": {"dims": "t", "data": y}, } assert_identical(ds, Dataset.from_dict(d)) # this one is missing some necessary information d = { "a": {"data": x}, "t": {"data": t, "dims": "t"}, "b": {"dims": "t", "data": y}, } with pytest.raises( ValueError, match=r"cannot convert dict without the key 'dims'" ): Dataset.from_dict(d) def test_to_and_from_dict_with_time_dim(self) -> None: x = np.random.randn(10, 3) y = np.random.randn(10, 3) t = pd.date_range("20130101", periods=10) lat = [77.7, 83.2, 76] ds = Dataset( { "a": (["t", "lat"], x), "b": (["t", "lat"], y), "t": ("t", t), "lat": ("lat", lat), } ) roundtripped = Dataset.from_dict(ds.to_dict()) assert_identical(ds, roundtripped) @pytest.mark.parametrize("data", [True, "list", "array"]) def test_to_and_from_dict_with_nan_nat( self, data: bool | Literal["list", "array"] ) -> None: x = np.random.randn(10, 3) y = np.random.randn(10, 3) y[2] = np.nan t = pd.Series(pd.date_range("20130101", periods=10)) t[2] = np.nan lat = [77.7, 83.2, 76] ds = Dataset( { "a": (["t", "lat"], x), "b": (["t", "lat"], y), "t": ("t", t), "lat": ("lat", lat), } ) roundtripped = Dataset.from_dict(ds.to_dict(data=data)) assert_identical(ds, roundtripped) def test_to_dict_with_numpy_attrs(self) -> None: # this doesn't need to roundtrip x = np.random.randn(10) y = np.random.randn(10) t = list("abcdefghij") attrs = { "created": np.float64(1998), "coords": np.array([37, -110.1, 100]), "maintainer": "bar", } ds = Dataset({"a": ("t", x, attrs), "b": ("t", y, attrs), "t": ("t", t)}) expected_attrs = { "created": attrs["created"].item(), # type: ignore[attr-defined] "coords": attrs["coords"].tolist(), # type: ignore[attr-defined] "maintainer": "bar", } actual = ds.to_dict() # check that they are identical assert expected_attrs == actual["data_vars"]["a"]["attrs"] def test_pickle(self) -> None: data = create_test_data() roundtripped = pickle.loads(pickle.dumps(data)) assert_identical(data, roundtripped) # regression test for #167: assert data.sizes == roundtripped.sizes def test_lazy_load(self) -> None: store = InaccessibleVariableDataStore() create_test_data().dump_to_store(store) for decode_cf in [True, False]: ds = open_dataset(store, decode_cf=decode_cf) with pytest.raises(UnexpectedDataAccess): ds.load() with pytest.raises(UnexpectedDataAccess): _ = ds["var1"].values # these should not raise UnexpectedDataAccess: ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) def test_lazy_load_duck_array(self) -> None: store = AccessibleAsDuckArrayDataStore() create_test_data().dump_to_store(store) for decode_cf in [True, False]: ds = open_dataset(store, decode_cf=decode_cf) with pytest.raises(UnexpectedDataAccess): _ = ds["var1"].values # these should not raise UnexpectedDataAccess: _ = ds.var1.data ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) repr(ds) # preserve the duck array type and don't cast to array assert isinstance(ds["var1"].load().data, DuckArrayWrapper) assert isinstance( ds["var1"].isel(dim2=0, dim1=0).load().data, DuckArrayWrapper ) ds.close() def test_dropna(self) -> None: x = np.random.randn(4, 4) x[::2, 0] = np.nan y = np.random.randn(4) y[-1] = np.nan ds = Dataset({"foo": (("a", "b"), x), "bar": (("b", y))}) expected = ds.isel(a=slice(1, None, 2)) actual = ds.dropna("a") assert_identical(actual, expected) expected = ds.isel(b=slice(1, 3)) actual = ds.dropna("b") assert_identical(actual, expected) actual = ds.dropna("b", subset=["foo", "bar"]) assert_identical(actual, expected) expected = ds.isel(b=slice(1, None)) actual = ds.dropna("b", subset=["foo"]) assert_identical(actual, expected) expected = ds.isel(b=slice(3)) actual = ds.dropna("b", subset=["bar"]) assert_identical(actual, expected) actual = ds.dropna("a", subset=[]) assert_identical(actual, ds) actual = ds.dropna("a", subset=["bar"]) assert_identical(actual, ds) actual = ds.dropna("a", how="all") assert_identical(actual, ds) actual = ds.dropna("b", how="all", subset=["bar"]) expected = ds.isel(b=[0, 1, 2]) assert_identical(actual, expected) actual = ds.dropna("b", thresh=1, subset=["bar"]) assert_identical(actual, expected) actual = ds.dropna("b", thresh=2) assert_identical(actual, ds) actual = ds.dropna("b", thresh=4) expected = ds.isel(b=[1, 2, 3]) assert_identical(actual, expected) actual = ds.dropna("a", thresh=3) expected = ds.isel(a=[1, 3]) assert_identical(actual, ds) with pytest.raises( ValueError, match=r"'foo' not found in data dimensions \('a', 'b'\)", ): ds.dropna("foo") with pytest.raises(ValueError, match=r"invalid how"): ds.dropna("a", how="somehow") # type: ignore[arg-type] with pytest.raises(TypeError, match=r"must specify how or thresh"): ds.dropna("a", how=None) # type: ignore[arg-type] def test_fillna(self) -> None: ds = Dataset({"a": ("x", [np.nan, 1, np.nan, 3])}, {"x": [0, 1, 2, 3]}) # fill with -1 actual1 = ds.fillna(-1) expected = Dataset({"a": ("x", [-1, 1, -1, 3])}, {"x": [0, 1, 2, 3]}) assert_identical(expected, actual1) actual2 = ds.fillna({"a": -1}) assert_identical(expected, actual2) other = Dataset({"a": -1}) actual3 = ds.fillna(other) assert_identical(expected, actual3) actual4 = ds.fillna({"a": other.a}) assert_identical(expected, actual4) # fill with range(4) b = DataArray(range(4), coords=[("x", range(4))]) actual5 = ds.fillna(b) expected = b.rename("a").to_dataset() assert_identical(expected, actual5) actual6 = ds.fillna(expected) assert_identical(expected, actual6) actual7 = ds.fillna(np.arange(4)) assert_identical(expected, actual7) actual8 = ds.fillna(b[:3]) assert_identical(expected, actual8) # okay to only include some data variables ds["b"] = np.nan actual9 = ds.fillna({"a": -1}) expected = Dataset( {"a": ("x", [-1, 1, -1, 3]), "b": np.nan}, {"x": [0, 1, 2, 3]} ) assert_identical(expected, actual9) # but new data variables is not okay with pytest.raises(ValueError, match=r"must be contained"): ds.fillna({"x": 0}) # empty argument should be OK result1 = ds.fillna({}) assert_identical(ds, result1) result2 = ds.fillna(Dataset(coords={"c": 42})) expected = ds.assign_coords(c=42) assert_identical(expected, result2) da = DataArray(range(5), name="a", attrs={"attr": "da"}) actual10 = da.fillna(1) assert actual10.name == "a" assert actual10.attrs == da.attrs ds = Dataset({"a": da}, attrs={"attr": "ds"}) actual11 = ds.fillna({"a": 1}) assert actual11.attrs == ds.attrs assert actual11.a.name == "a" assert actual11.a.attrs == ds.a.attrs @pytest.mark.parametrize( "func", [lambda x: x.clip(0, 1), lambda x: np.float64(1.0) * x, np.abs, abs] ) def test_propagate_attrs(self, func) -> None: da = DataArray(range(5), name="a", attrs={"attr": "da"}) ds = Dataset({"a": da}, attrs={"attr": "ds"}) # test defaults assert func(ds).attrs == ds.attrs with set_options(keep_attrs=False): assert func(ds).attrs != ds.attrs assert func(ds).a.attrs != ds.a.attrs with set_options(keep_attrs=False): assert func(ds).attrs != ds.attrs assert func(ds).a.attrs != ds.a.attrs with set_options(keep_attrs=True): assert func(ds).attrs == ds.attrs assert func(ds).a.attrs == ds.a.attrs def test_where(self) -> None: ds = Dataset({"a": ("x", range(5))}) expected1 = Dataset({"a": ("x", [np.nan, np.nan, 2, 3, 4])}) actual1 = ds.where(ds > 1) assert_identical(expected1, actual1) actual2 = ds.where(ds.a > 1) assert_identical(expected1, actual2) actual3 = ds.where(ds.a.values > 1) assert_identical(expected1, actual3) actual4 = ds.where(True) assert_identical(ds, actual4) expected5 = ds.copy(deep=True) expected5["a"].values = np.array([np.nan] * 5) actual5 = ds.where(False) assert_identical(expected5, actual5) # 2d ds = Dataset({"a": (("x", "y"), [[0, 1], [2, 3]])}) expected6 = Dataset({"a": (("x", "y"), [[np.nan, 1], [2, 3]])}) actual6 = ds.where(ds > 0) assert_identical(expected6, actual6) # attrs da = DataArray(range(5), name="a", attrs={"attr": "da"}) actual7 = da.where(da.values > 1) assert actual7.name == "a" assert actual7.attrs == da.attrs ds = Dataset({"a": da}, attrs={"attr": "ds"}) actual8 = ds.where(ds > 0) assert actual8.attrs == ds.attrs assert actual8.a.name == "a" assert actual8.a.attrs == ds.a.attrs # lambda ds = Dataset({"a": ("x", range(5))}) expected9 = Dataset({"a": ("x", [np.nan, np.nan, 2, 3, 4])}) actual9 = ds.where(lambda x: x > 1) assert_identical(expected9, actual9) def test_where_other(self) -> None: ds = Dataset({"a": ("x", range(5))}, {"x": range(5)}) expected = Dataset({"a": ("x", [-1, -1, 2, 3, 4])}, {"x": range(5)}) actual = ds.where(ds > 1, -1) assert_equal(expected, actual) assert actual.a.dtype == int actual = ds.where(lambda x: x > 1, -1) assert_equal(expected, actual) actual = ds.where(ds > 1, other=-1, drop=True) expected_nodrop = ds.where(ds > 1, -1) _, expected = xr.align(actual, expected_nodrop, join="left") assert_equal(actual, expected) assert actual.a.dtype == int with pytest.raises(ValueError, match=r"cannot align .* are not equal"): ds.where(ds > 1, ds.isel(x=slice(3))) with pytest.raises(ValueError, match=r"exact match required"): ds.where(ds > 1, ds.assign(b=2)) def test_where_drop(self) -> None: # if drop=True # 1d # data array case array = DataArray(range(5), coords=[range(5)], dims=["x"]) expected1 = DataArray(range(5)[2:], coords=[range(5)[2:]], dims=["x"]) actual1 = array.where(array > 1, drop=True) assert_identical(expected1, actual1) # dataset case ds = Dataset({"a": array}) expected2 = Dataset({"a": expected1}) actual2 = ds.where(ds > 1, drop=True) assert_identical(expected2, actual2) actual3 = ds.where(ds.a > 1, drop=True) assert_identical(expected2, actual3) with pytest.raises(TypeError, match=r"must be a"): ds.where(np.arange(5) > 1, drop=True) # 1d with odd coordinates array = DataArray( np.array([2, 7, 1, 8, 3]), coords=[np.array([3, 1, 4, 5, 9])], dims=["x"] ) expected4 = DataArray( np.array([7, 8, 3]), coords=[np.array([1, 5, 9])], dims=["x"] ) actual4 = array.where(array > 2, drop=True) assert_identical(expected4, actual4) # 1d multiple variables ds = Dataset({"a": (("x"), [0, 1, 2, 3]), "b": (("x"), [4, 5, 6, 7])}) expected5 = Dataset( {"a": (("x"), [np.nan, 1, 2, 3]), "b": (("x"), [4, 5, 6, np.nan])} ) actual5 = ds.where((ds > 0) & (ds < 7), drop=True) assert_identical(expected5, actual5) # 2d ds = Dataset({"a": (("x", "y"), [[0, 1], [2, 3]])}) expected6 = Dataset({"a": (("x", "y"), [[np.nan, 1], [2, 3]])}) actual6 = ds.where(ds > 0, drop=True) assert_identical(expected6, actual6) # 2d with odd coordinates ds = Dataset( {"a": (("x", "y"), [[0, 1], [2, 3]])}, coords={ "x": [4, 3], "y": [1, 2], "z": (["x", "y"], [[np.e, np.pi], [np.pi * np.e, np.pi * 3]]), }, ) expected7 = Dataset( {"a": (("x", "y"), [[3]])}, coords={"x": [3], "y": [2], "z": (["x", "y"], [[np.pi * 3]])}, ) actual7 = ds.where(ds > 2, drop=True) assert_identical(expected7, actual7) # 2d multiple variables ds = Dataset( {"a": (("x", "y"), [[0, 1], [2, 3]]), "b": (("x", "y"), [[4, 5], [6, 7]])} ) expected8 = Dataset( { "a": (("x", "y"), [[np.nan, 1], [2, 3]]), "b": (("x", "y"), [[4, 5], [6, 7]]), } ) actual8 = ds.where(ds > 0, drop=True) assert_identical(expected8, actual8) # mixed dimensions: PR#6690, Issue#6227 ds = xr.Dataset( { "a": ("x", [1, 2, 3]), "b": ("y", [2, 3, 4]), "c": (("x", "y"), np.arange(9).reshape((3, 3))), } ) expected9 = xr.Dataset( { "a": ("x", [np.nan, 3]), "b": ("y", [np.nan, 3, 4]), "c": (("x", "y"), np.arange(3.0, 9.0).reshape((2, 3))), } ) actual9 = ds.where(ds > 2, drop=True) assert actual9.sizes["x"] == 2 assert_identical(expected9, actual9) def test_where_drop_empty(self) -> None: # regression test for GH1341 array = DataArray(np.random.rand(100, 10), dims=["nCells", "nVertLevels"]) mask = DataArray(np.zeros((100,), dtype="bool"), dims="nCells") actual = array.where(mask, drop=True) expected = DataArray(np.zeros((0, 10)), dims=["nCells", "nVertLevels"]) assert_identical(expected, actual) def test_where_drop_no_indexes(self) -> None: ds = Dataset({"foo": ("x", [0.0, 1.0])}) expected = Dataset({"foo": ("x", [1.0])}) actual = ds.where(ds == 1, drop=True) assert_identical(expected, actual) def test_reduce(self) -> None: data = create_test_data() assert len(data.mean().coords) == 0 actual = data.max() expected = Dataset({k: v.max() for k, v in data.data_vars.items()}) assert_equal(expected, actual) assert_equal(data.min(dim=["dim1"]), data.min(dim="dim1")) for reduct, expected_dims in [ ("dim2", ["dim3", "time", "dim1"]), (["dim2", "time"], ["dim3", "dim1"]), (("dim2", "time"), ["dim3", "dim1"]), ((), ["dim2", "dim3", "time", "dim1"]), ]: actual_dims = list(data.min(dim=reduct).dims) assert actual_dims == expected_dims assert_equal(data.mean(dim=[]), data) with pytest.raises(ValueError): data.mean(axis=0) def test_reduce_coords(self) -> None: # regression test for GH1470 data = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"b": 4}) expected = xr.Dataset({"a": 2}, coords={"b": 4}) actual = data.mean("x") assert_identical(actual, expected) # should be consistent actual = data["a"].mean("x").to_dataset() assert_identical(actual, expected) def test_mean_uint_dtype(self) -> None: data = xr.Dataset( { "a": (("x", "y"), np.arange(6).reshape(3, 2).astype("uint")), "b": (("x",), np.array([0.1, 0.2, np.nan])), } ) actual = data.mean("x", skipna=True) expected = xr.Dataset( {"a": data["a"].mean("x"), "b": data["b"].mean("x", skipna=True)} ) assert_identical(actual, expected) def test_reduce_bad_dim(self) -> None: data = create_test_data() with pytest.raises( ValueError, match=re.escape("Dimension(s) 'bad_dim' do not exist"), ): data.mean(dim="bad_dim") def test_reduce_cumsum(self) -> None: data = xr.Dataset( {"a": 1, "b": ("x", [1, 2]), "c": (("x", "y"), [[np.nan, 3], [0, 4]])} ) assert_identical(data.fillna(0), data.cumsum("y")) expected = xr.Dataset( {"a": 1, "b": ("x", [1, 3]), "c": (("x", "y"), [[0, 3], [0, 7]])} ) assert_identical(expected, data.cumsum()) @pytest.mark.parametrize( "reduct, expected", [ ("dim1", ["dim2", "dim3", "time", "dim1"]), ("dim2", ["dim3", "time", "dim1", "dim2"]), ("dim3", ["dim2", "time", "dim1", "dim3"]), ("time", ["dim2", "dim3", "dim1"]), ], ) @pytest.mark.parametrize("func", ["cumsum", "cumprod"]) def test_reduce_cumsum_test_dims(self, reduct, expected, func) -> None: data = create_test_data() with pytest.raises( ValueError, match=re.escape("Dimension(s) 'bad_dim' do not exist"), ): getattr(data, func)(dim="bad_dim") # ensure dimensions are correct actual = getattr(data, func)(dim=reduct).dims assert list(actual) == expected def test_reduce_non_numeric(self) -> None: data1 = create_test_data(seed=44, use_extension_array=True) data2 = create_test_data(seed=44) add_vars = {"var5": ["dim1", "dim2"], "var6": ["dim1"]} for v, dims in sorted(add_vars.items()): size = tuple(data1.sizes[d] for d in dims) data = np.random.randint(0, 100, size=size).astype(np.str_) data1[v] = (dims, data, {"foo": "variable"}) # var4 is extension array categorical and should be dropped assert ( "var4" not in data1.mean() and "var5" not in data1.mean() and "var6" not in data1.mean() ) assert_equal(data1.mean(), data2.mean()) assert_equal(data1.mean(dim="dim1"), data2.mean(dim="dim1")) assert "var5" not in data1.mean(dim="dim2") and "var6" in data1.mean(dim="dim2") @pytest.mark.filterwarnings( "ignore:Once the behaviour of DataArray:DeprecationWarning" ) def test_reduce_strings(self) -> None: expected = Dataset({"x": "a"}) ds = Dataset({"x": ("y", ["a", "b"])}) ds.coords["y"] = [-10, 10] actual = ds.min() assert_identical(expected, actual) expected = Dataset({"x": "b"}) actual = ds.max() assert_identical(expected, actual) expected = Dataset({"x": 0}) actual = ds.argmin() assert_identical(expected, actual) expected = Dataset({"x": 1}) actual = ds.argmax() assert_identical(expected, actual) expected = Dataset({"x": -10}) actual = ds.idxmin() assert_identical(expected, actual) expected = Dataset({"x": 10}) actual = ds.idxmax() assert_identical(expected, actual) expected = Dataset({"x": b"a"}) ds = Dataset({"x": ("y", np.array(["a", "b"], "S1"))}) actual = ds.min() assert_identical(expected, actual) expected = Dataset({"x": "a"}) ds = Dataset({"x": ("y", np.array(["a", "b"], "U1"))}) actual = ds.min() assert_identical(expected, actual) def test_reduce_dtypes(self) -> None: # regression test for GH342 expected = Dataset({"x": 1}) actual = Dataset({"x": True}).sum() assert_identical(expected, actual) # regression test for GH505 expected = Dataset({"x": 3}) actual = Dataset({"x": ("y", np.array([1, 2], "uint16"))}).sum() assert_identical(expected, actual) expected = Dataset({"x": 1 + 1j}) actual = Dataset({"x": ("y", [1, 1j])}).sum() assert_identical(expected, actual) def test_reduce_keep_attrs(self) -> None: data = create_test_data() _attrs = {"attr1": "value1", "attr2": 2929} attrs = dict(_attrs) data.attrs = attrs # Test dropped attrs ds = data.mean() assert ds.attrs == {} for v in ds.data_vars.values(): assert v.attrs == {} # Test kept attrs ds = data.mean(keep_attrs=True) assert ds.attrs == attrs for k, v in ds.data_vars.items(): assert v.attrs == data[k].attrs @pytest.mark.filterwarnings( "ignore:Once the behaviour of DataArray:DeprecationWarning" ) def test_reduce_argmin(self) -> None: # regression test for #205 ds = Dataset({"a": ("x", [0, 1])}) expected = Dataset({"a": ([], 0)}) actual = ds.argmin() assert_identical(expected, actual) actual = ds.argmin("x") assert_identical(expected, actual) def test_reduce_scalars(self) -> None: ds = Dataset({"x": ("a", [2, 2]), "y": 2, "z": ("b", [2])}) expected = Dataset({"x": 0, "y": 0, "z": 0}) actual = ds.var() assert_identical(expected, actual) expected = Dataset({"x": 0, "y": 0, "z": ("b", [0])}) actual = ds.var("a") assert_identical(expected, actual) def test_reduce_only_one_axis(self) -> None: def mean_only_one_axis(x, axis): if not isinstance(axis, integer_types): raise TypeError("non-integer axis") return x.mean(axis) ds = Dataset({"a": (["x", "y"], [[0, 1, 2, 3, 4]])}) expected = Dataset({"a": ("x", [2])}) actual = ds.reduce(mean_only_one_axis, "y") assert_identical(expected, actual) with pytest.raises( TypeError, match=r"missing 1 required positional argument: 'axis'" ): ds.reduce(mean_only_one_axis) def test_reduce_no_axis(self) -> None: def total_sum(x): return np.sum(x.flatten()) ds = Dataset({"a": (["x", "y"], [[0, 1, 2, 3, 4]])}) expected = Dataset({"a": ((), 10)}) actual = ds.reduce(total_sum) assert_identical(expected, actual) with pytest.raises(TypeError, match=r"unexpected keyword argument 'axis'"): ds.reduce(total_sum, dim="x") def test_reduce_keepdims(self) -> None: ds = Dataset( {"a": (["x", "y"], [[0, 1, 2, 3, 4]])}, coords={ "y": [0, 1, 2, 3, 4], "x": [0], "lat": (["x", "y"], [[0, 1, 2, 3, 4]]), "c": -999.0, }, ) # Shape should match behaviour of numpy reductions with keepdims=True # Coordinates involved in the reduction should be removed actual = ds.mean(keepdims=True) expected = Dataset( {"a": (["x", "y"], np.mean(ds.a, keepdims=True).data)}, coords={"c": ds.c} ) assert_identical(expected, actual) actual = ds.mean("x", keepdims=True) expected = Dataset( {"a": (["x", "y"], np.mean(ds.a, axis=0, keepdims=True).data)}, coords={"y": ds.y, "c": ds.c}, ) assert_identical(expected, actual) @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True) @pytest.mark.parametrize("skipna", [True, False, None]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) def test_quantile(self, q, skipna, compute_backend) -> None: ds = create_test_data(seed=123) ds.var1.data[0, 0] = np.nan for dim in [None, "dim1", ["dim1"]]: ds_quantile = ds.quantile(q, dim=dim, skipna=skipna) if is_scalar(q): assert "quantile" not in ds_quantile.dims else: assert "quantile" in ds_quantile.dims for var, dar in ds.data_vars.items(): assert var in ds_quantile assert_identical( ds_quantile[var], dar.quantile(q, dim=dim, skipna=skipna) ) dim = ["dim1", "dim2"] ds_quantile = ds.quantile(q, dim=dim, skipna=skipna) assert "dim3" in ds_quantile.dims assert all(d not in ds_quantile.dims for d in dim) @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True) @pytest.mark.parametrize("skipna", [True, False]) def test_quantile_skipna(self, skipna, compute_backend) -> None: q = 0.1 dim = "time" ds = Dataset({"a": ([dim], np.arange(0, 11))}) ds = ds.where(ds >= 1) result = ds.quantile(q=q, dim=dim, skipna=skipna) value = 1.9 if skipna else np.nan expected = Dataset({"a": value}, coords={"quantile": q}) assert_identical(result, expected) @pytest.mark.parametrize("method", ["midpoint", "lower"]) def test_quantile_method(self, method) -> None: ds = create_test_data(seed=123) q = [0.25, 0.5, 0.75] result = ds.quantile(q, method=method) assert_identical(result.var1, ds.var1.quantile(q, method=method)) assert_identical(result.var2, ds.var2.quantile(q, method=method)) assert_identical(result.var3, ds.var3.quantile(q, method=method)) @pytest.mark.parametrize("method", ["midpoint", "lower"]) def test_quantile_interpolation_deprecated(self, method) -> None: ds = create_test_data(seed=123) q = [0.25, 0.5, 0.75] with warnings.catch_warnings(record=True) as w: ds.quantile(q, interpolation=method) # ensure the warning is only raised once assert len(w) == 1 with warnings.catch_warnings(record=True): with pytest.raises(TypeError, match="interpolation and method keywords"): ds.quantile(q, method=method, interpolation=method) @requires_bottleneck def test_rank(self) -> None: ds = create_test_data(seed=1234) # only ds.var3 depends on dim3 z = ds.rank("dim3") assert ["var3"] == list(z.data_vars) # same as dataarray version x = z.var3 y = ds.var3.rank("dim3") assert_equal(x, y) # coordinates stick assert list(z.coords) == list(ds.coords) assert list(x.coords) == list(y.coords) # invalid dim with pytest.raises( ValueError, match=re.escape( "Dimension 'invalid_dim' not found in data dimensions ('dim3', 'dim1')" ), ): x.rank("invalid_dim") def test_rank_use_bottleneck(self) -> None: ds = Dataset({"a": ("x", [0, np.nan, 2]), "b": ("y", [4, 6, 3, 4])}) with xr.set_options(use_bottleneck=False): with pytest.raises(RuntimeError): ds.rank("x") def test_count(self) -> None: ds = Dataset({"x": ("a", [np.nan, 1]), "y": 0, "z": np.nan}) expected = Dataset({"x": 1, "y": 1, "z": 0}) actual = ds.count() assert_identical(expected, actual) def test_map(self) -> None: data = create_test_data() data.attrs["foo"] = "bar" assert_identical(data.map(np.mean), data.mean()) expected = data.mean(keep_attrs=True) actual = data.map(lambda x: x.mean(keep_attrs=True), keep_attrs=True) assert_identical(expected, actual) assert_identical(data.map(lambda x: x, keep_attrs=True), data.drop_vars("time")) def scale(x, multiple=1): return multiple * x actual = data.map(scale, multiple=2) assert_equal(actual["var1"], 2 * data["var1"]) assert_identical(actual["numbers"], data["numbers"]) actual = data.map(np.asarray) expected = data.drop_vars("time") # time is not used on a data var assert_equal(expected, actual) def test_apply_pending_deprecated_map(self) -> None: data = create_test_data() data.attrs["foo"] = "bar" with pytest.warns(PendingDeprecationWarning): assert_identical(data.apply(np.mean), data.mean()) def make_example_math_dataset(self): variables = { "bar": ("x", np.arange(100, 400, 100)), "foo": (("x", "y"), 1.0 * np.arange(12).reshape(3, 4)), } coords = {"abc": ("x", ["a", "b", "c"]), "y": 10 * np.arange(4)} ds = Dataset(variables, coords) ds["foo"][0, 0] = np.nan return ds def test_dataset_number_math(self) -> None: ds = self.make_example_math_dataset() assert_identical(ds, +ds) assert_identical(ds, ds + 0) assert_identical(ds, 0 + ds) assert_identical(ds, ds + np.array(0)) assert_identical(ds, np.array(0) + ds) actual = ds.copy(deep=True) actual += 0 assert_identical(ds, actual) def test_unary_ops(self) -> None: ds = self.make_example_math_dataset() assert_identical(ds.map(abs), abs(ds)) assert_identical(ds.map(lambda x: x + 4), ds + 4) for func in [ lambda x: x.isnull(), lambda x: x.round(), lambda x: x.astype(int), ]: assert_identical(ds.map(func), func(ds)) assert_identical(ds.isnull(), ~ds.notnull()) # don't actually patch these methods in with pytest.raises(AttributeError): _ = ds.item with pytest.raises(AttributeError): _ = ds.searchsorted def test_dataset_array_math(self) -> None: ds = self.make_example_math_dataset() expected = ds.map(lambda x: x - ds["foo"]) assert_identical(expected, ds - ds["foo"]) assert_identical(expected, -ds["foo"] + ds) assert_identical(expected, ds - ds["foo"].variable) assert_identical(expected, -ds["foo"].variable + ds) actual = ds.copy(deep=True) actual -= ds["foo"] assert_identical(expected, actual) expected = ds.map(lambda x: x + ds["bar"]) assert_identical(expected, ds + ds["bar"]) actual = ds.copy(deep=True) actual += ds["bar"] assert_identical(expected, actual) expected = Dataset({"bar": ds["bar"] + np.arange(3)}) assert_identical(expected, ds[["bar"]] + np.arange(3)) assert_identical(expected, np.arange(3) + ds[["bar"]]) def test_dataset_dataset_math(self) -> None: ds = self.make_example_math_dataset() assert_identical(ds, ds + 0 * ds) assert_identical(ds, ds + {"foo": 0, "bar": 0}) expected = ds.map(lambda x: 2 * x) assert_identical(expected, 2 * ds) assert_identical(expected, ds + ds) assert_identical(expected, ds + ds.data_vars) assert_identical(expected, ds + dict(ds.data_vars)) actual = ds.copy(deep=True) expected_id = id(actual) actual += ds assert_identical(expected, actual) assert expected_id == id(actual) assert_identical(ds == ds, ds.notnull()) subsampled = ds.isel(y=slice(2)) expected = 2 * subsampled assert_identical(expected, subsampled + ds) assert_identical(expected, ds + subsampled) def test_dataset_math_auto_align(self) -> None: ds = self.make_example_math_dataset() subset = ds.isel(y=[1, 3]) expected = 2 * subset actual = ds + subset assert_identical(expected, actual) actual = ds.isel(y=slice(1)) + ds.isel(y=slice(1, None)) expected = 2 * ds.drop_sel(y=ds.y) assert_equal(actual, expected) actual = ds + ds[["bar"]] expected = (2 * ds[["bar"]]).merge(ds.coords) assert_identical(expected, actual) assert_identical(ds + Dataset(), ds.coords.to_dataset()) assert_identical(Dataset() + Dataset(), Dataset()) ds2 = Dataset(coords={"bar": 42}) assert_identical(ds + ds2, ds.coords.merge(ds2)) # maybe unary arithmetic with empty datasets should raise instead? assert_identical(Dataset() + 1, Dataset()) actual = ds.copy(deep=True) other = ds.isel(y=slice(2)) actual += other expected = ds + other.reindex_like(ds) assert_identical(expected, actual) def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() with pytest.raises(TypeError): ds["foo"] += ds with pytest.raises(TypeError): ds["foo"].variable += ds with pytest.raises(ValueError, match=r"must have the same"): ds += ds[["bar"]] # verify we can rollback in-place operations if something goes wrong # nb. inplace datetime64 math actually will work with an integer array # but not floats thanks to numpy's inconsistent handling other = DataArray(np.datetime64("2000-01-01"), coords={"c": 2}) actual = ds.copy(deep=True) with pytest.raises(TypeError): actual += other assert_identical(actual, ds) def test_dataset_transpose(self) -> None: ds = Dataset( { "a": (("x", "y"), np.random.randn(3, 4)), "b": (("y", "x"), np.random.randn(4, 3)), }, coords={ "x": range(3), "y": range(4), "xy": (("x", "y"), np.random.randn(3, 4)), }, ) actual = ds.transpose() expected = Dataset( {"a": (("y", "x"), ds.a.values.T), "b": (("x", "y"), ds.b.values.T)}, coords={ "x": ds.x.values, "y": ds.y.values, "xy": (("y", "x"), ds.xy.values.T), }, ) assert_identical(expected, actual) actual = ds.transpose(...) expected = ds assert_identical(expected, actual) actual = ds.transpose("x", "y") expected = ds.map(lambda x: x.transpose("x", "y", transpose_coords=True)) assert_identical(expected, actual) ds = create_test_data() actual = ds.transpose() for k in ds.variables: assert actual[k].dims[::-1] == ds[k].dims new_order = ("dim2", "dim3", "dim1", "time") actual = ds.transpose(*new_order) for k in ds.variables: expected_dims = tuple(d for d in new_order if d in ds[k].dims) assert actual[k].dims == expected_dims # same as above but with ellipsis new_order = ("dim2", "dim3", "dim1", "time") actual = ds.transpose("dim2", "dim3", ...) for k in ds.variables: expected_dims = tuple(d for d in new_order if d in ds[k].dims) assert actual[k].dims == expected_dims # test missing dimension, raise error with pytest.raises(ValueError): ds.transpose(..., "not_a_dim") # test missing dimension, ignore error actual = ds.transpose(..., "not_a_dim", missing_dims="ignore") expected_ell = ds.transpose(...) assert_identical(expected_ell, actual) # test missing dimension, raise warning with pytest.warns(UserWarning): actual = ds.transpose(..., "not_a_dim", missing_dims="warn") assert_identical(expected_ell, actual) assert "T" not in dir(ds) def test_dataset_ellipsis_transpose_different_ordered_vars(self) -> None: # https://github.com/pydata/xarray/issues/1081#issuecomment-544350457 ds = Dataset( dict( a=(("w", "x", "y", "z"), np.ones((2, 3, 4, 5))), b=(("x", "w", "y", "z"), np.zeros((3, 2, 4, 5))), ) ) result = ds.transpose(..., "z", "y") assert list(result["a"].dims) == list("wxzy") assert list(result["b"].dims) == list("xwzy") def test_dataset_retains_period_index_on_transpose(self) -> None: ds = create_test_data() ds["time"] = pd.period_range("2000-01-01", periods=20) transposed = ds.transpose() assert isinstance(transposed.time.to_index(), pd.PeriodIndex) def test_dataset_diff_n1_simple(self) -> None: ds = Dataset({"foo": ("x", [5, 5, 6, 6])}) actual = ds.diff("x") expected = Dataset({"foo": ("x", [0, 1, 0])}) assert_equal(expected, actual) def test_dataset_diff_n1_label(self) -> None: ds = Dataset({"foo": ("x", [5, 5, 6, 6])}, {"x": [0, 1, 2, 3]}) actual = ds.diff("x", label="lower") expected = Dataset({"foo": ("x", [0, 1, 0])}, {"x": [0, 1, 2]}) assert_equal(expected, actual) actual = ds.diff("x", label="upper") expected = Dataset({"foo": ("x", [0, 1, 0])}, {"x": [1, 2, 3]}) assert_equal(expected, actual) def test_dataset_diff_n1(self) -> None: ds = create_test_data(seed=1) actual = ds.diff("dim2") expected_dict = {} expected_dict["var1"] = DataArray( np.diff(ds["var1"].values, axis=1), {"dim2": ds["dim2"].values[1:]}, ["dim1", "dim2"], ) expected_dict["var2"] = DataArray( np.diff(ds["var2"].values, axis=1), {"dim2": ds["dim2"].values[1:]}, ["dim1", "dim2"], ) expected_dict["var3"] = ds["var3"] expected = Dataset(expected_dict, coords={"time": ds["time"].values}) expected.coords["numbers"] = ("dim3", ds["numbers"].values) assert_equal(expected, actual) def test_dataset_diff_n2(self) -> None: ds = create_test_data(seed=1) actual = ds.diff("dim2", n=2) expected_dict = {} expected_dict["var1"] = DataArray( np.diff(ds["var1"].values, axis=1, n=2), {"dim2": ds["dim2"].values[2:]}, ["dim1", "dim2"], ) expected_dict["var2"] = DataArray( np.diff(ds["var2"].values, axis=1, n=2), {"dim2": ds["dim2"].values[2:]}, ["dim1", "dim2"], ) expected_dict["var3"] = ds["var3"] expected = Dataset(expected_dict, coords={"time": ds["time"].values}) expected.coords["numbers"] = ("dim3", ds["numbers"].values) assert_equal(expected, actual) def test_dataset_diff_exception_n_neg(self) -> None: ds = create_test_data(seed=1) with pytest.raises(ValueError, match=r"must be non-negative"): ds.diff("dim2", n=-1) def test_dataset_diff_exception_label_str(self) -> None: ds = create_test_data(seed=1) with pytest.raises(ValueError, match=r"'label' argument has to"): ds.diff("dim2", label="raise_me") # type: ignore[arg-type] @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"foo": -10}]) def test_shift(self, fill_value) -> None: coords = {"bar": ("x", list("abc")), "x": [-4, 3, 2]} attrs = {"meta": "data"} ds = Dataset({"foo": ("x", [1, 2, 3])}, coords, attrs) actual = ds.shift(x=1, fill_value=fill_value) if fill_value == dtypes.NA: # if we supply the default, we expect the missing value for a # float array fill_value = np.nan elif isinstance(fill_value, dict): fill_value = fill_value.get("foo", np.nan) expected = Dataset({"foo": ("x", [fill_value, 1, 2])}, coords, attrs) assert_identical(expected, actual) with pytest.raises(ValueError, match=r"dimensions"): ds.shift(foo=123) def test_roll_coords(self) -> None: coords = {"bar": ("x", list("abc")), "x": [-4, 3, 2]} attrs = {"meta": "data"} ds = Dataset({"foo": ("x", [1, 2, 3])}, coords, attrs) actual = ds.roll(x=1, roll_coords=True) ex_coords = {"bar": ("x", list("cab")), "x": [2, -4, 3]} expected = Dataset({"foo": ("x", [3, 1, 2])}, ex_coords, attrs) assert_identical(expected, actual) with pytest.raises(ValueError, match=r"dimensions"): ds.roll(foo=123, roll_coords=True) def test_roll_no_coords(self) -> None: coords = {"bar": ("x", list("abc")), "x": [-4, 3, 2]} attrs = {"meta": "data"} ds = Dataset({"foo": ("x", [1, 2, 3])}, coords, attrs) actual = ds.roll(x=1) expected = Dataset({"foo": ("x", [3, 1, 2])}, coords, attrs) assert_identical(expected, actual) with pytest.raises(ValueError, match=r"dimensions"): ds.roll(abc=321) def test_roll_multidim(self) -> None: # regression test for 2445 arr = xr.DataArray( [[1, 2, 3], [4, 5, 6]], coords={"x": range(3), "y": range(2)}, dims=("y", "x"), ) actual = arr.roll(x=1, roll_coords=True) expected = xr.DataArray( [[3, 1, 2], [6, 4, 5]], coords=[("y", [0, 1]), ("x", [2, 0, 1])] ) assert_identical(expected, actual) def test_real_and_imag(self) -> None: attrs = {"foo": "bar"} ds = Dataset({"x": ((), 1 + 2j, attrs)}, attrs=attrs) expected_re = Dataset({"x": ((), 1, attrs)}, attrs=attrs) assert_identical(ds.real, expected_re) expected_im = Dataset({"x": ((), 2, attrs)}, attrs=attrs) assert_identical(ds.imag, expected_im) def test_setattr_raises(self) -> None: ds = Dataset({}, coords={"scalar": 1}, attrs={"foo": "bar"}) with pytest.raises(AttributeError, match=r"cannot set attr"): ds.scalar = 2 with pytest.raises(AttributeError, match=r"cannot set attr"): ds.foo = 2 with pytest.raises(AttributeError, match=r"cannot set attr"): ds.other = 2 def test_filter_by_attrs(self) -> None: precip = dict(standard_name="convective_precipitation_flux") temp0 = dict(standard_name="air_potential_temperature", height="0 m") temp10 = dict(standard_name="air_potential_temperature", height="10 m") ds = Dataset( { "temperature_0": (["t"], [0], temp0), "temperature_10": (["t"], [0], temp10), "precipitation": (["t"], [0], precip), }, coords={"time": (["t"], [0], dict(axis="T", long_name="time_in_seconds"))}, ) # Test return empty Dataset. ds.filter_by_attrs(standard_name="invalid_standard_name") new_ds = ds.filter_by_attrs(standard_name="invalid_standard_name") assert not bool(new_ds.data_vars) # Test return one DataArray. new_ds = ds.filter_by_attrs(standard_name="convective_precipitation_flux") assert new_ds["precipitation"].standard_name == "convective_precipitation_flux" assert_equal(new_ds["precipitation"], ds["precipitation"]) # Test filter coordinates new_ds = ds.filter_by_attrs(long_name="time_in_seconds") assert new_ds["time"].long_name == "time_in_seconds" assert not bool(new_ds.data_vars) # Test return more than one DataArray. new_ds = ds.filter_by_attrs(standard_name="air_potential_temperature") assert len(new_ds.data_vars) == 2 for var in new_ds.data_vars: assert new_ds[var].standard_name == "air_potential_temperature" # Test callable. new_ds = ds.filter_by_attrs(height=lambda v: v is not None) assert len(new_ds.data_vars) == 2 for var in new_ds.data_vars: assert new_ds[var].standard_name == "air_potential_temperature" new_ds = ds.filter_by_attrs(height="10 m") assert len(new_ds.data_vars) == 1 for var in new_ds.data_vars: assert new_ds[var].height == "10 m" # Test return empty Dataset due to conflicting filters new_ds = ds.filter_by_attrs( standard_name="convective_precipitation_flux", height="0 m" ) assert not bool(new_ds.data_vars) # Test return one DataArray with two filter conditions new_ds = ds.filter_by_attrs( standard_name="air_potential_temperature", height="0 m" ) for var in new_ds.data_vars: assert new_ds[var].standard_name == "air_potential_temperature" assert new_ds[var].height == "0 m" assert new_ds[var].height != "10 m" # Test return empty Dataset due to conflicting callables new_ds = ds.filter_by_attrs( standard_name=lambda v: False, height=lambda v: True ) assert not bool(new_ds.data_vars) def test_binary_op_propagate_indexes(self) -> None: ds = Dataset( {"d1": DataArray([1, 2, 3], dims=["x"], coords={"x": [10, 20, 30]})} ) expected = ds.xindexes["x"] actual = (ds * 2).xindexes["x"] assert expected is actual def test_binary_op_join_setting(self) -> None: # arithmetic_join applies to data array coordinates missing_2 = xr.Dataset({"x": [0, 1]}) missing_0 = xr.Dataset({"x": [1, 2]}) with xr.set_options(arithmetic_join="outer"): actual = missing_2 + missing_0 expected = xr.Dataset({"x": [0, 1, 2]}) assert_equal(actual, expected) # arithmetic join also applies to data_vars ds1 = xr.Dataset({"foo": 1, "bar": 2}) ds2 = xr.Dataset({"bar": 2, "baz": 3}) expected = xr.Dataset({"bar": 4}) # default is inner joining actual = ds1 + ds2 assert_equal(actual, expected) with xr.set_options(arithmetic_join="outer"): expected = xr.Dataset({"foo": np.nan, "bar": 4, "baz": np.nan}) actual = ds1 + ds2 assert_equal(actual, expected) with xr.set_options(arithmetic_join="left"): expected = xr.Dataset({"foo": np.nan, "bar": 4}) actual = ds1 + ds2 assert_equal(actual, expected) with xr.set_options(arithmetic_join="right"): expected = xr.Dataset({"bar": 4, "baz": np.nan}) actual = ds1 + ds2 assert_equal(actual, expected) @pytest.mark.parametrize( ["keep_attrs", "expected"], ( pytest.param(False, {}, id="False"), pytest.param(True, {"foo": "a", "bar": "b"}, id="True"), ), ) def test_binary_ops_keep_attrs(self, keep_attrs, expected) -> None: ds1 = xr.Dataset({"a": 1}, attrs={"foo": "a", "bar": "b"}) ds2 = xr.Dataset({"a": 1}, attrs={"foo": "a", "baz": "c"}) with xr.set_options(keep_attrs=keep_attrs): ds_result = ds1 + ds2 assert ds_result.attrs == expected def test_full_like(self) -> None: # For more thorough tests, see test_variable.py # Note: testing data_vars with mismatched dtypes ds = Dataset( { "d1": DataArray([1, 2, 3], dims=["x"], coords={"x": [10, 20, 30]}), "d2": DataArray([1.1, 2.2, 3.3], dims=["y"]), }, attrs={"foo": "bar"}, ) actual = full_like(ds, 2) expected = ds.copy(deep=True) # https://github.com/python/mypy/issues/3004 expected["d1"].values = [2, 2, 2] # type: ignore[assignment] expected["d2"].values = [2.0, 2.0, 2.0] # type: ignore[assignment] assert expected["d1"].dtype == int assert expected["d2"].dtype == float assert_identical(expected, actual) # override dtype actual = full_like(ds, fill_value=True, dtype=bool) expected = ds.copy(deep=True) expected["d1"].values = [True, True, True] # type: ignore[assignment] expected["d2"].values = [True, True, True] # type: ignore[assignment] assert expected["d1"].dtype == bool assert expected["d2"].dtype == bool assert_identical(expected, actual) # with multiple fill values actual = full_like(ds, {"d1": 1, "d2": 2.3}) expected = ds.assign(d1=("x", [1, 1, 1]), d2=("y", [2.3, 2.3, 2.3])) assert expected["d1"].dtype == int assert expected["d2"].dtype == float assert_identical(expected, actual) # override multiple dtypes actual = full_like(ds, fill_value={"d1": 1, "d2": 2.3}, dtype={"d1": bool}) expected = ds.assign(d1=("x", [True, True, True]), d2=("y", [2.3, 2.3, 2.3])) assert expected["d1"].dtype == bool assert expected["d2"].dtype == float assert_identical(expected, actual) def test_combine_first(self) -> None: dsx0 = DataArray([0, 0], [("x", ["a", "b"])]).to_dataset(name="dsx0") dsx1 = DataArray([1, 1], [("x", ["b", "c"])]).to_dataset(name="dsx1") actual = dsx0.combine_first(dsx1) expected = Dataset( {"dsx0": ("x", [0, 0, np.nan]), "dsx1": ("x", [np.nan, 1, 1])}, coords={"x": ["a", "b", "c"]}, ) assert_equal(actual, expected) assert_equal(actual, xr.merge([dsx0, dsx1])) # works just like xr.merge([self, other]) dsy2 = DataArray([2, 2, 2], [("x", ["b", "c", "d"])]).to_dataset(name="dsy2") actual = dsx0.combine_first(dsy2) expected = xr.merge([dsy2, dsx0]) assert_equal(actual, expected) def test_sortby(self) -> None: ds = Dataset( { "A": DataArray( [[1, 2], [3, 4], [5, 6]], [("x", ["c", "b", "a"]), ("y", [1, 0])] ), "B": DataArray([[5, 6], [7, 8], [9, 10]], dims=["x", "y"]), } ) sorted1d = Dataset( { "A": DataArray( [[5, 6], [3, 4], [1, 2]], [("x", ["a", "b", "c"]), ("y", [1, 0])] ), "B": DataArray([[9, 10], [7, 8], [5, 6]], dims=["x", "y"]), } ) sorted2d = Dataset( { "A": DataArray( [[6, 5], [4, 3], [2, 1]], [("x", ["a", "b", "c"]), ("y", [0, 1])] ), "B": DataArray([[10, 9], [8, 7], [6, 5]], dims=["x", "y"]), } ) expected = sorted1d dax = DataArray([100, 99, 98], [("x", ["c", "b", "a"])]) actual = ds.sortby(dax) assert_equal(actual, expected) # test descending order sort actual = ds.sortby(dax, ascending=False) assert_equal(actual, ds) # test alignment (fills in nan for 'c') dax_short = DataArray([98, 97], [("x", ["b", "a"])]) actual = ds.sortby(dax_short) assert_equal(actual, expected) # test 1-D lexsort # dax0 is sorted first to give indices of [1, 2, 0] # and then dax1 would be used to move index 2 ahead of 1 dax0 = DataArray([100, 95, 95], [("x", ["c", "b", "a"])]) dax1 = DataArray([0, 1, 0], [("x", ["c", "b", "a"])]) actual = ds.sortby([dax0, dax1]) # lexsort underneath gives [2, 1, 0] assert_equal(actual, expected) expected = sorted2d # test multi-dim sort by 1D dataarray values day = DataArray([90, 80], [("y", [1, 0])]) actual = ds.sortby([day, dax]) assert_equal(actual, expected) # test exception-raising with pytest.raises(KeyError): actual = ds.sortby("z") with pytest.raises(ValueError) as excinfo: actual = ds.sortby(ds["A"]) assert "DataArray is not 1-D" in str(excinfo.value) expected = sorted1d actual = ds.sortby("x") assert_equal(actual, expected) # test pandas.MultiIndex indices = (("b", 1), ("b", 0), ("a", 1), ("a", 0)) midx = pd.MultiIndex.from_tuples(indices, names=["one", "two"]) ds_midx = Dataset( { "A": DataArray( [[1, 2], [3, 4], [5, 6], [7, 8]], [("x", midx), ("y", [1, 0])] ), "B": DataArray([[5, 6], [7, 8], [9, 10], [11, 12]], dims=["x", "y"]), } ) actual = ds_midx.sortby("x") midx_reversed = pd.MultiIndex.from_tuples( tuple(reversed(indices)), names=["one", "two"] ) expected = Dataset( { "A": DataArray( [[7, 8], [5, 6], [3, 4], [1, 2]], [("x", midx_reversed), ("y", [1, 0])], ), "B": DataArray([[11, 12], [9, 10], [7, 8], [5, 6]], dims=["x", "y"]), } ) assert_equal(actual, expected) # multi-dim sort by coordinate objects expected = sorted2d actual = ds.sortby(["x", "y"]) assert_equal(actual, expected) # test descending order sort actual = ds.sortby(["x", "y"], ascending=False) assert_equal(actual, ds) def test_attribute_access(self) -> None: ds = create_test_data(seed=1) for key in ["var1", "var2", "var3", "time", "dim1", "dim2", "dim3", "numbers"]: assert_equal(ds[key], getattr(ds, key)) assert key in dir(ds) for key in ["dim3", "dim1", "numbers"]: assert_equal(ds["var3"][key], getattr(ds.var3, key)) assert key in dir(ds["var3"]) # attrs assert ds["var3"].attrs["foo"] == ds.var3.foo assert "foo" in dir(ds["var3"]) def test_ipython_key_completion(self) -> None: ds = create_test_data(seed=1) actual = ds._ipython_key_completions_() expected = ["var1", "var2", "var3", "time", "dim1", "dim2", "dim3", "numbers"] for item in actual: ds[item] # should not raise assert sorted(actual) == sorted(expected) # for dataarray actual = ds["var3"]._ipython_key_completions_() expected = ["dim3", "dim1", "numbers"] for item in actual: ds["var3"][item] # should not raise assert sorted(actual) == sorted(expected) # MultiIndex ds_midx = ds.stack(dim12=["dim2", "dim3"]) actual = ds_midx._ipython_key_completions_() expected = [ "var1", "var2", "var3", "time", "dim1", "dim2", "dim3", "numbers", "dim12", ] for item in actual: ds_midx[item] # should not raise assert sorted(actual) == sorted(expected) # coords actual = ds.coords._ipython_key_completions_() expected = ["time", "dim1", "dim2", "dim3", "numbers"] for item in actual: ds.coords[item] # should not raise assert sorted(actual) == sorted(expected) actual = ds["var3"].coords._ipython_key_completions_() expected = ["dim1", "dim3", "numbers"] for item in actual: ds["var3"].coords[item] # should not raise assert sorted(actual) == sorted(expected) coords = Coordinates(ds.coords) actual = coords._ipython_key_completions_() expected = ["time", "dim2", "dim3", "numbers"] for item in actual: coords[item] # should not raise assert sorted(actual) == sorted(expected) # data_vars actual = ds.data_vars._ipython_key_completions_() expected = ["var1", "var2", "var3", "dim1"] for item in actual: ds.data_vars[item] # should not raise assert sorted(actual) == sorted(expected) def test_polyfit_output(self) -> None: ds = create_test_data(seed=1) out = ds.polyfit("dim2", 2, full=False) assert "var1_polyfit_coefficients" in out out = ds.polyfit("dim1", 2, full=True) assert "var1_polyfit_coefficients" in out assert "dim1_matrix_rank" in out out = ds.polyfit("time", 2) assert len(out.data_vars) == 0 def test_polyfit_weighted(self) -> None: ds = create_test_data(seed=1) ds = ds.broadcast_like(ds) # test more than 2 dimensions (issue #9972) ds_copy = ds.copy(deep=True) expected = ds.polyfit("dim2", 2) actual = ds.polyfit("dim2", 2, w=np.ones(ds.sizes["dim2"])) xr.testing.assert_identical(expected, actual) # Make sure weighted polyfit does not change the original object (issue #5644) xr.testing.assert_identical(ds, ds_copy) def test_polyfit_coord(self) -> None: # Make sure polyfit works when given a non-dimension coordinate. ds = create_test_data(seed=1) out = ds.polyfit("numbers", 2, full=False) assert "var3_polyfit_coefficients" in out assert "dim1" in out.dims assert "dim2" not in out assert "dim3" not in out def test_polyfit_coord_output(self) -> None: da = xr.DataArray( [1, 3, 2], dims=["x"], coords=dict(x=["a", "b", "c"], y=("x", [0, 1, 2])) ) out = da.polyfit("y", deg=1)["polyfit_coefficients"] assert out.sel(degree=0).item() == pytest.approx(1.5) assert out.sel(degree=1).item() == pytest.approx(0.5) def test_polyfit_warnings(self) -> None: ds = create_test_data(seed=1) with warnings.catch_warnings(record=True) as ws: ds.var1.polyfit("dim2", 10, full=False) assert len(ws) == 1 assert ws[0].category == RankWarning ds.var1.polyfit("dim2", 10, full=True) assert len(ws) == 1 def test_polyfit_polyval(self) -> None: da = xr.DataArray( np.arange(1, 10).astype(np.float64), dims=["x"], coords=dict(x=np.arange(9)) ) out = da.polyfit("x", 3, full=False) da_fitval = xr.polyval(da.x, out.polyfit_coefficients) # polyval introduces very small errors (1e-16 here) xr.testing.assert_allclose(da_fitval, da) da = da.assign_coords(x=xr.date_range("2001-01-01", periods=9, freq="YS")) out = da.polyfit("x", 3, full=False) da_fitval = xr.polyval(da.x, out.polyfit_coefficients) xr.testing.assert_allclose(da_fitval, da, rtol=1e-3) @requires_cftime def test_polyfit_polyval_cftime(self) -> None: da = xr.DataArray( np.arange(1, 10).astype(np.float64), dims=["x"], coords=dict( x=xr.date_range("2001-01-01", periods=9, freq="YS", calendar="noleap") ), ) out = da.polyfit("x", 3, full=False) da_fitval = xr.polyval(da.x, out.polyfit_coefficients) np.testing.assert_allclose(da_fitval, da) @staticmethod def _test_data_var_interior( original_data_var, padded_data_var, padded_dim_name, expected_pad_values ): np.testing.assert_equal( np.unique(padded_data_var.isel({padded_dim_name: [0, -1]})), expected_pad_values, ) np.testing.assert_array_equal( padded_data_var.isel({padded_dim_name: slice(1, -1)}), original_data_var ) @pytest.mark.parametrize("padded_dim_name", ["dim1", "dim2", "dim3", "time"]) @pytest.mark.parametrize( ["constant_values"], [ pytest.param(None, id="default"), pytest.param(42, id="scalar"), pytest.param((42, 43), id="tuple"), pytest.param({"dim1": 42, "dim2": 43}, id="per dim scalar"), pytest.param({"dim1": (42, 43), "dim2": (43, 44)}, id="per dim tuple"), pytest.param({"var1": 42, "var2": (42, 43)}, id="per var"), pytest.param({"var1": 42, "dim1": (42, 43)}, id="mixed"), ], ) def test_pad(self, padded_dim_name, constant_values) -> None: ds = create_test_data(seed=1) padded = ds.pad({padded_dim_name: (1, 1)}, constant_values=constant_values) # test padded dim values and size for ds_dim_name, ds_dim in ds.sizes.items(): if ds_dim_name == padded_dim_name: np.testing.assert_equal(padded.sizes[ds_dim_name], ds_dim + 2) if ds_dim_name in padded.coords: assert padded[ds_dim_name][[0, -1]].isnull().all() else: np.testing.assert_equal(padded.sizes[ds_dim_name], ds_dim) # check if coord "numbers" with dimension dim3 is padded correctly if padded_dim_name == "dim3": assert padded["numbers"][[0, -1]].isnull().all() # twarning: passes but dtype changes from int to float np.testing.assert_array_equal(padded["numbers"][1:-1], ds["numbers"]) # test if data_vars are paded with correct values for data_var_name, data_var in padded.data_vars.items(): if padded_dim_name in data_var.dims: if utils.is_dict_like(constant_values): if ( expected := constant_values.get(data_var_name, None) ) is not None: self._test_data_var_interior( ds[data_var_name], data_var, padded_dim_name, expected ) elif ( expected := constant_values.get(padded_dim_name, None) ) is not None: self._test_data_var_interior( ds[data_var_name], data_var, padded_dim_name, expected ) else: self._test_data_var_interior( ds[data_var_name], data_var, padded_dim_name, 0 ) elif constant_values: self._test_data_var_interior( ds[data_var_name], data_var, padded_dim_name, constant_values ) else: self._test_data_var_interior( ds[data_var_name], data_var, padded_dim_name, np.nan ) else: assert_array_equal(data_var, ds[data_var_name]) @pytest.mark.parametrize( ["keep_attrs", "attrs", "expected"], [ pytest.param(None, {"a": 1, "b": 2}, {"a": 1, "b": 2}, id="default"), pytest.param(False, {"a": 1, "b": 2}, {}, id="False"), pytest.param(True, {"a": 1, "b": 2}, {"a": 1, "b": 2}, id="True"), ], ) def test_pad_keep_attrs(self, keep_attrs, attrs, expected) -> None: ds = xr.Dataset( {"a": ("x", [1, 2], attrs), "b": ("y", [1, 2], attrs)}, coords={"c": ("x", [-1, 1], attrs), "d": ("y", [-1, 1], attrs)}, attrs=attrs, ) expected = xr.Dataset( {"a": ("x", [0, 1, 2, 0], expected), "b": ("y", [1, 2], attrs)}, coords={ "c": ("x", [np.nan, -1, 1, np.nan], expected), "d": ("y", [-1, 1], attrs), }, attrs=expected, ) keep_attrs_ = "default" if keep_attrs is None else keep_attrs with set_options(keep_attrs=keep_attrs_): actual = ds.pad({"x": (1, 1)}, mode="constant", constant_values=0) xr.testing.assert_identical(actual, expected) actual = ds.pad( {"x": (1, 1)}, mode="constant", constant_values=0, keep_attrs=keep_attrs ) xr.testing.assert_identical(actual, expected) def test_astype_attrs(self) -> None: data = create_test_data(seed=123) data.attrs["foo"] = "bar" assert data.attrs == data.astype(float).attrs assert data.var1.attrs == data.astype(float).var1.attrs assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs @pytest.mark.parametrize("parser", ["pandas", "python"]) @pytest.mark.parametrize( "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] ) @pytest.mark.parametrize( "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] ) def test_query(self, backend, engine, parser) -> None: """Test querying a dataset.""" # setup test data np.random.seed(42) a = np.arange(0, 10, 1) b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( object ) e = np.arange(0, 10 * 20).reshape(10, 20) f = np.random.normal(0, 1, size=(10, 20, 30)) if backend == "numpy": ds = Dataset( { "a": ("x", a), "b": ("x", b), "c": ("y", c), "d": ("z", d), "e": (("x", "y"), e), "f": (("x", "y", "z"), f), }, coords={ "a2": ("x", a), "b2": ("x", b), "c2": ("y", c), "d2": ("z", d), "e2": (("x", "y"), e), "f2": (("x", "y", "z"), f), }, ) elif backend == "dask": ds = Dataset( { "a": ("x", da.from_array(a, chunks=3)), "b": ("x", da.from_array(b, chunks=3)), "c": ("y", da.from_array(c, chunks=7)), "d": ("z", da.from_array(d, chunks=12)), "e": (("x", "y"), da.from_array(e, chunks=(3, 7))), "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))), }, coords={ "a2": ("x", a), "b2": ("x", b), "c2": ("y", c), "d2": ("z", d), "e2": (("x", "y"), e), "f2": (("x", "y", "z"), f), }, ) # query single dim, single variable with raise_if_dask_computes(): actual = ds.query(x="a2 > 5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5)) assert_identical(expect, actual) # query single dim, single variable, via dict with raise_if_dask_computes(): actual = ds.query(dict(x="a2 > 5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5))) assert_identical(expect, actual) # query single dim, single variable with raise_if_dask_computes(): actual = ds.query(x="b2 > 50", engine=engine, parser=parser) expect = ds.isel(x=(b > 50)) assert_identical(expect, actual) # query single dim, single variable with raise_if_dask_computes(): actual = ds.query(y="c2 < .5", engine=engine, parser=parser) expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) # query single dim, single string variable if parser == "pandas": # N.B., this query currently only works with the pandas parser # xref https://github.com/pandas-dev/pandas/issues/40436 with raise_if_dask_computes(): actual = ds.query(z='d2 == "bar"', engine=engine, parser=parser) expect = ds.isel(z=(d == "bar")) assert_identical(expect, actual) # query single dim, multiple variables with raise_if_dask_computes(): actual = ds.query(x="(a2 > 5) & (b2 > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # query single dim, multiple variables with computation with raise_if_dask_computes(): actual = ds.query(x="(a2 * b2) > 250", engine=engine, parser=parser) expect = ds.isel(x=(a * b) > 250) assert_identical(expect, actual) # check pandas query syntax is supported if parser == "pandas": with raise_if_dask_computes(): actual = ds.query( x="(a2 > 5) and (b2 > 50)", engine=engine, parser=parser ) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # query multiple dims via kwargs with raise_if_dask_computes(): actual = ds.query(x="a2 > 5", y="c2 < .5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) # query multiple dims via kwargs if parser == "pandas": with raise_if_dask_computes(): actual = ds.query( x="a2 > 5", y="c2 < .5", z="d2 == 'bar'", engine=engine, parser=parser, ) expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) assert_identical(expect, actual) # query multiple dims via dict with raise_if_dask_computes(): actual = ds.query( dict(x="a2 > 5", y="c2 < .5"), engine=engine, parser=parser ) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) # query multiple dims via dict if parser == "pandas": with raise_if_dask_computes(): actual = ds.query( dict(x="a2 > 5", y="c2 < .5", z="d2 == 'bar'"), engine=engine, parser=parser, ) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) assert_identical(expect, actual) # test error handling with pytest.raises(ValueError): ds.query("a > 5") # type: ignore[arg-type] # must be dict or kwargs with pytest.raises(ValueError): ds.query(x=(a > 5)) with pytest.raises(IndexError): ds.query(y="a > 5") # wrong length dimension with pytest.raises(IndexError): ds.query(x="c < .5") # wrong length dimension with pytest.raises(IndexError): ds.query(x="e > 100") # wrong number of dimensions with pytest.raises(UndefinedVariableError): ds.query(x="spam > 50") # name not present # pytest tests — new tests should go here, rather than in the class. @pytest.mark.parametrize("parser", ["pandas", "python"]) def test_eval(ds, parser) -> None: """Currently much more minimal testing that `query` above, and much of the setup isn't used. But the risks are fairly low — `query` shares much of the code, and the method is currently experimental.""" actual = ds.eval("z1 + 5", parser=parser) expect = ds["z1"] + 5 assert_identical(expect, actual) # check pandas query syntax is supported if parser == "pandas": actual = ds.eval("(z1 > 5) and (z2 > 0)", parser=parser) expect = (ds["z1"] > 5) & (ds["z2"] > 0) assert_identical(expect, actual) @pytest.mark.parametrize("test_elements", ([1, 2], np.array([1, 2]), DataArray([1, 2]))) def test_isin(test_elements, backend) -> None: expected = Dataset( data_vars={ "var1": (("dim1",), [0, 1]), "var2": (("dim1",), [1, 1]), "var3": (("dim1",), [0, 1]), } ).astype("bool") if backend == "dask": expected = expected.chunk() result = Dataset( data_vars={ "var1": (("dim1",), [0, 1]), "var2": (("dim1",), [1, 2]), "var3": (("dim1",), [0, 1]), } ).isin(test_elements) assert_equal(result, expected) def test_isin_dataset() -> None: ds = Dataset({"x": [1, 2]}) with pytest.raises(TypeError): ds.isin(ds) @pytest.mark.parametrize( "unaligned_coords", ( {"x": [2, 1, 0]}, {"x": (["x"], np.asarray([2, 1, 0]))}, {"x": (["x"], np.asarray([1, 2, 0]))}, {"x": pd.Index([2, 1, 0])}, {"x": Variable(dims="x", data=[0, 2, 1])}, {"x": IndexVariable(dims="x", data=[0, 1, 2])}, {"y": 42}, {"y": ("x", [2, 1, 0])}, {"y": ("x", np.asarray([2, 1, 0]))}, {"y": (["x"], np.asarray([2, 1, 0]))}, ), ) @pytest.mark.parametrize("coords", ({"x": ("x", [0, 1, 2])}, {"x": [0, 1, 2]})) def test_dataset_constructor_aligns_to_explicit_coords( unaligned_coords, coords ) -> None: a = xr.DataArray([1, 2, 3], dims=["x"], coords=unaligned_coords) expected = xr.Dataset(coords=coords) expected["a"] = a result = xr.Dataset({"a": a}, coords=coords) assert_equal(expected, result) def test_error_message_on_set_supplied() -> None: with pytest.raises(TypeError, match="has invalid type "): xr.Dataset(dict(date=[1, 2, 3], sec={4})) @pytest.mark.parametrize("unaligned_coords", ({"y": ("b", np.asarray([2, 1, 0]))},)) def test_constructor_raises_with_invalid_coords(unaligned_coords) -> None: with pytest.raises(ValueError, match="not a subset of the DataArray dimensions"): xr.DataArray([1, 2, 3], dims=["x"], coords=unaligned_coords) @pytest.mark.parametrize("ds", [3], indirect=True) def test_dir_expected_attrs(ds) -> None: some_expected_attrs = {"pipe", "mean", "isnull", "var1", "dim2", "numbers"} result = dir(ds) assert set(result) >= some_expected_attrs def test_dir_non_string(ds) -> None: # add a numbered key to ensure this doesn't break dir ds[5] = "foo" result = dir(ds) assert 5 not in result # GH2172 sample_data = np.random.uniform(size=[2, 2000, 10000]) x = xr.Dataset({"sample_data": (sample_data.shape, sample_data)}) x2 = x["sample_data"] dir(x2) def test_dir_unicode(ds) -> None: ds["unicode"] = "uni" result = dir(ds) assert "unicode" in result def test_raise_no_warning_for_nan_in_binary_ops() -> None: with assert_no_warnings(): _ = Dataset(data_vars={"x": ("y", [1, 2, np.nan])}) > 0 @pytest.mark.filterwarnings("error") @pytest.mark.parametrize("ds", (2,), indirect=True) def test_raise_no_warning_assert_close(ds) -> None: assert_allclose(ds, ds) @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("edge_order", [1, 2]) def test_differentiate(dask, edge_order) -> None: rs = np.random.default_rng(42) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( rs.random((8, 6)), dims=["x", "y"], coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))}, ) if dask and has_dask: da = da.chunk({"x": 4}) ds = xr.Dataset({"var": da}) # along x actual = da.differentiate("x", edge_order) expected_x = xr.DataArray( np.gradient(da, da["x"], axis=0, edge_order=edge_order), dims=da.dims, coords=da.coords, ) assert_equal(expected_x, actual) assert_equal( ds["var"].differentiate("x", edge_order=edge_order), ds.differentiate("x", edge_order=edge_order)["var"], ) # coordinate should not change assert_equal(da["x"], actual["x"]) # along y actual = da.differentiate("y", edge_order) expected_y = xr.DataArray( np.gradient(da, da["y"], axis=1, edge_order=edge_order), dims=da.dims, coords=da.coords, ) assert_equal(expected_y, actual) assert_equal(actual, ds.differentiate("y", edge_order=edge_order)["var"]) assert_equal( ds["var"].differentiate("y", edge_order=edge_order), ds.differentiate("y", edge_order=edge_order)["var"], ) with pytest.raises(ValueError): da.differentiate("x2d") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.default_rng(42) coord = np.array( [ "2004-07-13", "2006-01-13", "2010-08-13", "2010-09-13", "2010-10-11", "2010-12-13", "2011-02-13", "2012-08-13", ], dtype="datetime64", ) da = xr.DataArray( rs.random((8, 6)), dims=["x", "y"], coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))}, ) if dask and has_dask: da = da.chunk({"x": 4}) # along x actual = da.differentiate("x", edge_order=1, datetime_unit="D") expected_x = xr.DataArray( np.gradient( da, da["x"].variable._to_numeric(datetime_unit="D"), axis=0, edge_order=1 ), dims=da.dims, coords=da.coords, ) assert_equal(expected_x, actual) actual2 = da.differentiate("x", edge_order=1, datetime_unit="h") assert np.allclose(actual, actual2 * 24) # for datetime variable actual = da["x"].differentiate("x", edge_order=1, datetime_unit="D") assert np.allclose(actual, 1.0) # with different date unit da = xr.DataArray(coord.astype("datetime64[ms]"), dims=["x"], coords={"x": coord}) actual = da.differentiate("x", edge_order=1) assert np.allclose(actual, 1.0) @requires_cftime @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_cftime(dask) -> None: rs = np.random.default_rng(42) coord = xr.cftime_range("2000", periods=8, freq="2ME") da = xr.DataArray( rs.random((8, 6)), coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))}, dims=["time", "y"], ) if dask and has_dask: da = da.chunk({"time": 4}) actual = da.differentiate("time", edge_order=1, datetime_unit="D") expected_data = np.gradient( da, da["time"].variable._to_numeric(datetime_unit="D"), axis=0, edge_order=1 ) expected = xr.DataArray(expected_data, coords=da.coords, dims=da.dims) assert_equal(expected, actual) actual2 = da.differentiate("time", edge_order=1, datetime_unit="h") assert_allclose(actual, actual2 * 24) # Test the differentiation of datetimes themselves actual = da["time"].differentiate("time", edge_order=1, datetime_unit="D") assert_allclose(actual, xr.ones_like(da["time"]).astype(float)) @pytest.mark.parametrize("dask", [True, False]) def test_integrate(dask) -> None: rs = np.random.default_rng(42) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( rs.random((8, 6)), dims=["x", "y"], coords={ "x": coord, "x2": (("x",), rs.random(8)), "z": 3, "x2d": (("x", "y"), rs.random((8, 6))), }, ) if dask and has_dask: da = da.chunk({"x": 4}) ds = xr.Dataset({"var": da}) # along x actual = da.integrate("x") # coordinate that contains x should be dropped. expected_x = xr.DataArray( trapezoid(da.compute(), da["x"], axis=0), dims=["y"], coords={k: v for k, v in da.coords.items() if "x" not in v.dims}, ) assert_allclose(expected_x, actual.compute()) assert_equal(ds["var"].integrate("x"), ds.integrate("x")["var"]) # make sure result is also a dask array (if the source is dask array) assert isinstance(actual.data, type(da.data)) # along y actual = da.integrate("y") expected_y = xr.DataArray( trapezoid(da, da["y"], axis=1), dims=["x"], coords={k: v for k, v in da.coords.items() if "y" not in v.dims}, ) assert_allclose(expected_y, actual.compute()) assert_equal(actual, ds.integrate("y")["var"]) assert_equal(ds["var"].integrate("y"), ds.integrate("y")["var"]) # along x and y actual = da.integrate(("y", "x")) assert actual.ndim == 0 with pytest.raises(ValueError): da.integrate("x2d") @requires_scipy @pytest.mark.parametrize("dask", [True, False]) def test_cumulative_integrate(dask) -> None: rs = np.random.default_rng(43) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( rs.random((8, 6)), dims=["x", "y"], coords={ "x": coord, "x2": (("x",), rs.random(8)), "z": 3, "x2d": (("x", "y"), rs.random((8, 6))), }, ) if dask and has_dask: da = da.chunk({"x": 4}) ds = xr.Dataset({"var": da}) # along x actual = da.cumulative_integrate("x") from scipy.integrate import cumulative_trapezoid expected_x = xr.DataArray( cumulative_trapezoid(da.compute(), da["x"], axis=0, initial=0.0), dims=["x", "y"], coords=da.coords, ) assert_allclose(expected_x, actual.compute()) assert_equal( ds["var"].cumulative_integrate("x"), ds.cumulative_integrate("x")["var"], ) # make sure result is also a dask array (if the source is dask array) assert isinstance(actual.data, type(da.data)) # along y actual = da.cumulative_integrate("y") expected_y = xr.DataArray( cumulative_trapezoid(da, da["y"], axis=1, initial=0.0), dims=["x", "y"], coords=da.coords, ) assert_allclose(expected_y, actual.compute()) assert_equal(actual, ds.cumulative_integrate("y")["var"]) assert_equal( ds["var"].cumulative_integrate("y"), ds.cumulative_integrate("y")["var"], ) # along x and y actual = da.cumulative_integrate(("y", "x")) assert actual.ndim == 2 with pytest.raises(ValueError): da.cumulative_integrate("x2d") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: rs = np.random.default_rng(42) coord: ArrayLike if which_datetime == "np": coord = np.array( [ "2004-07-13", "2006-01-13", "2010-08-13", "2010-09-13", "2010-10-11", "2010-12-13", "2011-02-13", "2012-08-13", ], dtype="datetime64", ) else: if not has_cftime: pytest.skip("Test requires cftime.") coord = xr.cftime_range("2000", periods=8, freq="2D") da = xr.DataArray( rs.random((8, 6)), coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))}, dims=["time", "y"], ) if dask and has_dask: da = da.chunk({"time": 4}) actual = da.integrate("time", datetime_unit="D") expected_data = trapezoid( da.compute().data, duck_array_ops.datetime_to_numeric(da["time"].data, datetime_unit="D"), axis=0, ) expected = xr.DataArray( expected_data, dims=["y"], coords={k: v for k, v in da.coords.items() if "time" not in v.dims}, ) assert_allclose(expected, actual.compute()) # make sure result is also a dask array (if the source is dask array) assert isinstance(actual.data, type(da.data)) actual2 = da.integrate("time", datetime_unit="h") assert_allclose(actual, actual2 / 24.0) def test_no_dict() -> None: d = Dataset() with pytest.raises(AttributeError): _ = d.__dict__ def test_subclass_slots() -> None: """Test that Dataset subclasses must explicitly define ``__slots__``. .. note:: As of 0.13.0, this is actually mitigated into a FutureWarning for any class defined outside of the xarray package. """ with pytest.raises(AttributeError) as e: class MyDS(Dataset): pass assert str(e.value) == "MyDS must explicitly define __slots__" def test_weakref() -> None: """Classes with __slots__ are incompatible with the weakref module unless they explicitly state __weakref__ among their slots """ from weakref import ref ds = Dataset() r = ref(ds) assert r() is ds def test_deepcopy_obj_array() -> None: x0 = Dataset(dict(foo=DataArray(np.array([object()])))) x1 = deepcopy(x0) assert x0["foo"].values[0] is not x1["foo"].values[0] def test_deepcopy_recursive() -> None: # GH:issue:7111 # direct recursion ds = xr.Dataset({"a": (["x"], [1, 2])}) ds.attrs["other"] = ds # TODO: cannot use assert_identical on recursive Vars yet... # lets just ensure that deep copy works without RecursionError ds.copy(deep=True) # indirect recursion ds2 = xr.Dataset({"b": (["y"], [3, 4])}) ds.attrs["other"] = ds2 ds2.attrs["other"] = ds # TODO: cannot use assert_identical on recursive Vars yet... # lets just ensure that deep copy works without RecursionError ds.copy(deep=True) ds2.copy(deep=True) def test_clip(ds) -> None: result = ds.clip(min=0.5) assert all((result.min(...) >= 0.5).values()) result = ds.clip(max=0.5) assert all((result.max(...) <= 0.5).values()) result = ds.clip(min=0.25, max=0.75) assert all((result.min(...) >= 0.25).values()) assert all((result.max(...) <= 0.75).values()) result = ds.clip(min=ds.mean("y"), max=ds.mean("y")) assert result.sizes == ds.sizes class TestDropDuplicates: @pytest.mark.parametrize("keep", ["first", "last", False]) def test_drop_duplicates_1d(self, keep) -> None: ds = xr.Dataset( {"a": ("time", [0, 5, 6, 7]), "b": ("time", [9, 3, 8, 2])}, coords={"time": [0, 0, 1, 2]}, ) if keep == "first": a = [0, 6, 7] b = [9, 8, 2] time = [0, 1, 2] elif keep == "last": a = [5, 6, 7] b = [3, 8, 2] time = [0, 1, 2] else: a = [6, 7] b = [8, 2] time = [1, 2] expected = xr.Dataset( {"a": ("time", a), "b": ("time", b)}, coords={"time": time} ) result = ds.drop_duplicates("time", keep=keep) assert_equal(expected, result) with pytest.raises( ValueError, match=re.escape( "Dimensions ('space',) not found in data dimensions ('time',)" ), ): ds.drop_duplicates("space", keep=keep) class TestNumpyCoercion: def test_from_numpy(self) -> None: ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", [4, 5, 6])}) assert_identical(ds.as_numpy(), ds) @requires_dask def test_from_dask(self) -> None: ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", [4, 5, 6])}) ds_chunked = ds.chunk(1) assert_identical(ds_chunked.as_numpy(), ds.compute()) @requires_pint def test_from_pint(self) -> None: from pint import Quantity arr = np.array([1, 2, 3]) ds = xr.Dataset( {"a": ("x", Quantity(arr, units="Pa"))}, coords={"lat": ("x", Quantity(arr + 3, units="m"))}, ) expected = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", arr + 3)}) assert_identical(ds.as_numpy(), expected) @requires_sparse def test_from_sparse(self) -> None: import sparse arr = np.diagflat([1, 2, 3]) sparr = sparse.COO.from_numpy(arr) ds = xr.Dataset( {"a": (["x", "y"], sparr)}, coords={"elev": (("x", "y"), sparr + 3)} ) expected = xr.Dataset( {"a": (["x", "y"], arr)}, coords={"elev": (("x", "y"), arr + 3)} ) assert_identical(ds.as_numpy(), expected) @requires_cupy def test_from_cupy(self) -> None: import cupy as cp arr = np.array([1, 2, 3]) ds = xr.Dataset( {"a": ("x", cp.array(arr))}, coords={"lat": ("x", cp.array(arr + 3))} ) expected = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", arr + 3)}) assert_identical(ds.as_numpy(), expected) @requires_dask @requires_pint def test_from_pint_wrapping_dask(self) -> None: import dask from pint import Quantity arr = np.array([1, 2, 3]) d = dask.array.from_array(arr) ds = xr.Dataset( {"a": ("x", Quantity(d, units="Pa"))}, coords={"lat": ("x", Quantity(d, units="m") * 2)}, ) result = ds.as_numpy() expected = xr.Dataset({"a": ("x", arr)}, coords={"lat": ("x", arr * 2)}) assert_identical(result, expected) def test_string_keys_typing() -> None: """Tests that string keys to `variables` are permitted by mypy""" da = xr.DataArray(np.arange(10), dims=["x"]) ds = xr.Dataset(dict(x=da)) mapping = {"y": da} ds.assign(variables=mapping) def test_transpose_error() -> None: # Transpose dataset with list as argument # Should raise error ds = xr.Dataset({"foo": (("x", "y"), [[21]]), "bar": (("x", "y"), [[12]])}) with pytest.raises( TypeError, match=re.escape( "transpose requires dim to be passed as multiple arguments. Expected `'y', 'x'`. Received `['y', 'x']` instead" ), ): ds.transpose(["y", "x"]) # type: ignore[arg-type]