from __future__ import annotations

from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Literal

import numpy as np
import pandas as pd
import pytest

from xarray import DataArray, Dataset, Variable, concat
from xarray.core import dtypes, merge
from xarray.core.coordinates import Coordinates
from xarray.core.indexes import PandasIndex
from xarray.tests import (
    ConcatenatableArray,
    InaccessibleArray,
    UnexpectedDataAccess,
    assert_array_equal,
    assert_equal,
    assert_identical,
    requires_dask,
)
from xarray.tests.test_dataset import create_test_data

if TYPE_CHECKING:
    from xarray.core.types import CombineAttrsOptions, JoinOptions


# helper method to create multiple tests datasets to concat
def create_concat_datasets(
    num_datasets: int = 2, seed: int | None = None, include_day: bool = True
) -> list[Dataset]:
    """Build ``num_datasets`` datasets sharing lat/lon coords.

    Every dataset carries the same five variables; with ``include_day`` each
    dataset gets its own pair of consecutive "day" labels so the results can
    be concatenated along the "day" dimension.
    """
    rng = np.random.default_rng(seed)
    lat = rng.standard_normal(size=(1, 4))
    lon = rng.standard_normal(size=(1, 4))
    result = []
    variables = ["temperature", "pressure", "humidity", "precipitation", "cloud_cover"]
    for i in range(num_datasets):
        if include_day:
            data_tuple = (
                ["x", "y", "day"],
                rng.standard_normal(size=(1, 4, 2)),
            )
            # NOTE: all five variables share one data tuple, i.e. one random
            # draw per dataset — identical values across variables is fine for
            # these concat tests.
            data_vars = {v: data_tuple for v in variables}
            result.append(
                Dataset(
                    data_vars=data_vars,
                    coords={
                        "lat": (["x", "y"], lat),
                        "lon": (["x", "y"], lon),
                        "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
                    },
                )
            )
        else:
            data_tuple = (
                ["x", "y"],
                rng.standard_normal(size=(1, 4)),
            )
            data_vars = {v: data_tuple for v in variables}
            result.append(
                Dataset(
                    data_vars=data_vars,
                    coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)},
                )
            )

    return result


# helper method to create multiple tests datasets to concat with specific types
def create_typed_datasets(
    num_datasets: int = 2, seed: int | None = None
) -> list[Dataset]:
    """Build datasets with one variable per dtype (float/str/int/datetime/...)."""
    var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"]
    rng = np.random.default_rng(seed)
    lat = rng.standard_normal(size=(1, 4))
    lon = rng.standard_normal(size=(1, 4))
    return [
        Dataset(
            data_vars={
                "float": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))),
                "float2": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))),
                "string": (
                    ["x", "y", "day"],
                    rng.choice(var_strings, size=(1, 4, 2)),
                ),
                "int": (["x", "y", "day"], rng.integers(0, 10, size=(1, 4, 2))),
                "datetime64": (
                    ["x", "y", "day"],
                    np.arange(
                        np.datetime64("2017-01-01"), np.datetime64("2017-01-09")
                    ).reshape(1, 4, 2),
                ),
                "timedelta64": (
                    ["x", "y", "day"],
                    np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]),
                ),
            },
            coords={
                "lat": (["x", "y"], lat),
                "lon": (["x", "y"], lon),
                "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
            },
        )
        for i in range(num_datasets)
    ]


def test_concat_compat() -> None:
    """Variables without the concat dim are merged, not concatenated."""
    ds1 = Dataset(
        {
            "has_x_y": (("y", "x"), [[1, 2]]),
            "has_x": ("x", [1, 2]),
            "no_x_y": ("z", [1, 2]),
        },
        coords={"x": [0, 1], "y": [0], "z": [-1, -2]},
    )
    ds2 = Dataset(
        {
            "has_x_y": (("y", "x"), [[3, 4]]),
            "has_x": ("x", [1, 2]),
            "no_x_y": (("q", "z"), [[1, 2]]),
        },
        coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]},
    )
    result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals")
    assert_equal(ds2.no_x_y, result.no_x_y.transpose())

    for var in ["has_x", "no_x_y"]:
        # variables lacking the concat dim must not acquire it
        assert "y" not in result[var].dims and "y" not in result[var].coords

    with pytest.raises(ValueError, match=r"'q' not present in all datasets"):
        concat([ds1, ds2], dim="q")
    with pytest.raises(ValueError, match=r"'q' not present in all datasets"):
        concat([ds2, ds1], dim="q")


def test_concat_missing_var() -> None:
    """A variable missing from one dataset is filled with NaN after concat."""
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["humidity", "precipitation", "cloud_cover"]

    expected = expected.drop_vars(vars_to_drop)
    # "pressure" is dropped from the second dataset only -> NaN for its days
    expected["pressure"][..., 2:] = np.nan

    datasets[0] = datasets[0].drop_vars(vars_to_drop)
    datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"])
    actual = concat(datasets, dim="day")

    assert list(actual.data_vars.keys()) == ["temperature", "pressure"]
    assert_identical(actual,
                     expected)


def test_concat_categorical() -> None:
    """Extension (categorical) arrays concatenate via _concat_same_type."""
    data1 = create_test_data(use_extension_array=True)
    data2 = create_test_data(use_extension_array=True)
    concatenated = concat([data1, data2], dim="dim1")
    assert (
        concatenated["var4"]
        == type(data2["var4"].variable.data.array)._concat_same_type(
            [
                data1["var4"].variable.data.array,
                data2["var4"].variable.data.array,
            ]
        )
    ).all()


def test_concat_missing_multiple_consecutive_var() -> None:
    """Variables missing from several leading datasets get NaN fill there."""
    datasets = create_concat_datasets(3, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["humidity", "pressure"]

    # dropped from the first two datasets -> NaN for their four days
    expected["pressure"][..., :4] = np.nan
    expected["humidity"][..., :4] = np.nan

    datasets[0] = datasets[0].drop_vars(vars_to_drop)
    datasets[1] = datasets[1].drop_vars(vars_to_drop)
    actual = concat(datasets, dim="day")

    assert list(actual.data_vars.keys()) == [
        "temperature",
        "precipitation",
        "cloud_cover",
        "pressure",
        "humidity",
    ]
    assert_identical(actual, expected)


def test_concat_all_empty() -> None:
    ds1 = Dataset()
    ds2 = Dataset()
    expected = Dataset()
    actual = concat([ds1, ds2], dim="new_dim")

    assert_identical(actual, expected)


def test_concat_second_empty() -> None:
    ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1})
    ds2 = Dataset(coords={"x": 0.1})

    expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1})
    actual = concat([ds1, ds2], dim="y")
    assert_identical(actual, expected)

    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])}
    )
    actual = concat([ds1, ds2], dim="y", coords="all")
    assert_identical(actual, expected)

    # Check concatenating scalar data_var only present in ds1
    ds1["b"] = 0.1
    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan]), "b": ("y", [0.1, np.nan])},
        coords={"x": ("y", [0.1, 0.1])},
    )
    actual = concat([ds1, ds2], dim="y", coords="all", data_vars="all")
    assert_identical(actual, expected)

    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1}
    )
    actual = concat([ds1, ds2], dim="y", coords="different", data_vars="different")
    assert_identical(actual, expected)


def test_concat_multiple_missing_variables() -> None:
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["pressure", "cloud_cover"]

    expected["pressure"][..., 2:] = np.nan
    expected["cloud_cover"][..., 2:] = np.nan

    datasets[1] = datasets[1].drop_vars(vars_to_drop)
    actual = concat(datasets, dim="day")

    # check the variables orders are the same
    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]
    assert_identical(actual, expected)


@pytest.mark.parametrize("include_day", [True, False])
def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None:
    """Each dataset drops one distinct variable; order follows first appearance."""
    vars_to_drop = [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]

    datasets = create_concat_datasets(
        len(vars_to_drop), seed=123, include_day=include_day
    )
    expected = concat(datasets, dim="day")

    for i, name in enumerate(vars_to_drop):
        if include_day:
            expected[name][..., i * 2 : (i + 1) * 2] = np.nan
        else:
            expected[name][i : i + 1, ...] = np.nan

    # set up the test data
    datasets = [
        ds.drop_vars(varname)
        for ds, varname in zip(datasets, vars_to_drop, strict=True)
    ]
    actual = concat(datasets, dim="day")

    assert list(actual.data_vars.keys()) == [
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
        "temperature",
    ]
    assert_identical(actual, expected)


def test_concat_multiple_datasets_with_multiple_missing_variables() -> None:
    vars_to_drop_in_first = ["temperature", "pressure"]
    vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"]
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    for name in vars_to_drop_in_first:
        expected[name][..., :2] = np.nan
    for name in vars_to_drop_in_second:
        expected[name][..., 2:] = np.nan

    # set up the test data
    datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
    datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)

    actual = concat(datasets, dim="day")

    assert list(actual.data_vars.keys()) == [
        "humidity",
        "precipitation",
        "cloud_cover",
        "temperature",
        "pressure",
    ]
    assert_identical(actual, expected)


def test_concat_type_of_missing_fill() -> None:
    """Fill values promote dtypes sensibly (int->float, str->object, ...)."""
    datasets = create_typed_datasets(2, seed=123)
    expected1 = concat(datasets, dim="day", fill_value=dtypes.NA)
    expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA)
    vars = ["float", "float2", "string", "int", "datetime64", "timedelta64"]
    expected = [expected2, expected1]
    for i, exp in enumerate(expected):
        sl = slice(i * 2, (i + 1) * 2)
        exp["float2"][..., sl] = np.nan
        exp["datetime64"][..., sl] = np.nan
        exp["timedelta64"][..., sl] = np.nan
        # int is promoted to float to hold NaN
        var = exp["int"] * 1.0
        var[..., sl] = np.nan
        exp["int"] = var
        # fixed-width string is promoted to object to hold NaN
        var = exp["string"].astype(object)
        var[..., sl] = np.nan
        exp["string"] = var

    # set up the test data
    datasets[1] = datasets[1].drop_vars(vars[1:])

    actual = concat(datasets, dim="day", fill_value=dtypes.NA)

    assert_identical(actual, expected[1])

    # reversed
    actual = concat(datasets[::-1], dim="day", fill_value=dtypes.NA)

    assert_identical(actual, expected[0])


def test_concat_order_when_filling_missing() -> None:
    vars_to_drop_in_first: list[str] = []
    # drop middle
    vars_to_drop_in_second = ["humidity"]
    datasets = create_concat_datasets(2, seed=123)
    expected1 = concat(datasets, dim="day")
    for name in vars_to_drop_in_second:
        expected1[name][..., 2:] = np.nan
    expected2 = concat(datasets[::-1], dim="day")
    for name in vars_to_drop_in_second:
        expected2[name][..., :2] = np.nan

    # set up the test data
    datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
    datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)

    actual = concat(datasets, dim="day")

    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]
    assert_identical(actual, expected1)

    actual = concat(datasets[::-1], dim="day")

    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "precipitation",
        "cloud_cover",
        "humidity",
    ]
    assert_identical(actual, expected2)


@pytest.fixture
def concat_var_names() -> Callable:
    # create var names list with one missing value
    def get_varnames(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]:
        orig = [f"d{i:02d}" for i in range(var_cnt)]
        var_names = []
        for _i in range(list_cnt):
            l1 = orig.copy()
            var_names.append(l1)
        return var_names

    return get_varnames


@pytest.fixture
def create_concat_ds() -> Callable:
    def create_ds(
        var_names: list[list[str]],
        dim: bool = False,
        coord: bool = False,
        drop_idx: list[int] | None = None,
    ) -> list[Dataset]:
        out_ds = []
        ds = Dataset()
        ds = ds.assign_coords({"x": np.arange(2)})
        ds = ds.assign_coords({"y": np.arange(3)})
        ds = ds.assign_coords({"z": np.arange(4)})

        for i, dsl in enumerate(var_names):
            vlist = dsl.copy()
            if drop_idx is not None:
                vlist.pop(drop_idx[i])
            foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4)
            dsi = ds.copy()
            if coord:
                dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])})
            for k in vlist:
                dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())})
            if not dim:
                # squeeze "time" away so concat has to create it
                dsi = dsi.isel(time=0)
            out_ds.append(dsi)

        return out_ds

    return create_ds


@pytest.mark.parametrize("dim", [True, False])
@pytest.mark.parametrize("coord", [True, False])
def test_concat_fill_missing_variables(
    concat_var_names, create_concat_ds, dim: bool, coord: bool
) -> None:
    """Each dataset drops a different variable; concat must NaN-fill each."""
    var_names = concat_var_names()
    drop_idx = [0, 7, 6, 4, 4, 8, 0, 6, 2, 0]

    expected = concat(
        create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all"
    )
    for i, idx in enumerate(drop_idx):
        if dim:
            expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan
        else:
            expected[var_names[0][idx]][i] = np.nan

    concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx)
    actual = concat(concat_ds, dim="time", data_vars="all")

    # d00 is dropped from the first dataset, so it sorts last
    assert list(actual.data_vars.keys()) == [
        "d01",
        "d02",
        "d03",
        "d04",
        "d05",
        "d06",
        "d07",
        "d08",
        "d09",
        "d00",
    ]
    assert_identical(actual, expected)


class TestConcatDataset:
    @pytest.fixture
    def data(self, request) -> Dataset:
        # indirect parametrization may pass use_extension_array via request.param
        use_extension_array = request.param if hasattr(request, "param") else False
        return create_test_data(use_extension_array=use_extension_array).drop_dims(
            "dim3"
        )

    def rectify_dim_order(self, data: Dataset, dataset) -> Dataset:
        # return a new dataset with all variable dimensions transposed into
        # the order in which they are found in `data`
        return Dataset(
            {k: v.transpose(*data[k].dims) for k, v in dataset.data_vars.items()},
            dataset.coords,
            attrs=dataset.attrs,
        )

    @pytest.mark.parametrize("coords", ["different", "minimal"])
    @pytest.mark.parametrize(
        "dim,data", [["dim1", True], ["dim2", False]], indirect=["data"]
    )
    def test_concat_simple(self, data: Dataset, dim, coords) -> None:
        datasets = [g for _, g in data.groupby(dim, squeeze=False)]
        assert_identical(data, concat(datasets, dim, coords=coords))

    def test_concat_merge_variables_present_in_some_datasets(
        self, data: Dataset
    ) -> None:
        # coordinates present in some datasets but not others
        ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1})
        ds2 = Dataset(data_vars={"a": ("y", [0.2])},
                      coords={"z": 0.2})

        actual = concat([ds1, ds2], dim="y", coords="minimal")
        expected = Dataset({"a": ("y", [0.1, 0.2])}, coords={"x": 0.1, "z": 0.2})
        assert_identical(expected, actual)

        # data variables present in some datasets but not others
        split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))]
        data0, data1 = deepcopy(split_data)
        data1["foo"] = ("bar", np.random.randn(10))

        actual = concat([data0, data1], "dim1", data_vars="minimal")
        expected = data.copy().assign(foo=data1.foo)
        assert_identical(expected, actual)

        # expand foo
        actual = concat([data0, data1], "dim1")
        foo = np.ones((8, 10), dtype=data1.foo.dtype) * np.nan
        foo[3:] = data1.foo.values[None, ...]
        expected = data.copy().assign(foo=(["dim1", "bar"], foo))
        assert_identical(expected, actual)

    @pytest.mark.parametrize("data", [False], indirect=["data"])
    def test_concat_2(self, data: Dataset) -> None:
        dim = "dim2"
        datasets = [g.squeeze(dim) for _, g in data.groupby(dim, squeeze=False)]
        concat_over = [k for k, v in data.coords.items() if dim in v.dims and k != dim]
        actual = concat(datasets, data[dim], coords=concat_over)
        assert_identical(data, self.rectify_dim_order(data, actual))

    @pytest.mark.parametrize("coords", ["different", "minimal", "all"])
    @pytest.mark.parametrize("dim", ["dim1", "dim2"])
    def test_concat_coords_kwarg(
        self, data: Dataset, dim: str, coords: Literal["all", "minimal", "different"]
    ) -> None:
        data = data.copy(deep=True)
        # make sure the coords argument behaves as expected
        data.coords["extra"] = ("dim4", np.arange(3))
        datasets = [g.squeeze() for _, g in data.groupby(dim, squeeze=False)]

        actual = concat(datasets, data[dim], coords=coords)
        if coords == "all":
            expected = np.array([data["extra"].values for _ in range(data.sizes[dim])])
            assert_array_equal(actual["extra"].values, expected)
        else:
            assert_equal(data["extra"], actual["extra"])

    def test_concat(self, data: Dataset) -> None:
        # mix of sliced (length-3) and scalar (length-1) selections
        split_data = [
            data.isel(dim1=slice(3)),
            data.isel(dim1=3),
            data.isel(dim1=slice(4, None)),
        ]
        assert_identical(data, concat(split_data, "dim1"))

    def test_concat_dim_precedence(self, data: Dataset) -> None:
        # verify that the dim argument takes precedence over
        # concatenating dataset variables of the same name
        dim = (2 * data["dim1"]).rename("dim1")
        datasets = [g for _, g in data.groupby("dim1", squeeze=False)]
        expected = data.copy()
        expected["dim1"] = dim
        assert_identical(expected, concat(datasets, dim))

    def test_concat_data_vars_typing(self) -> None:
        # Testing typing, can be removed if the next function works with annotations.
        data = Dataset({"foo": ("x", np.random.randn(10))})
        objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))]
        actual = concat(objs, dim="x", data_vars="minimal")
        assert_identical(data, actual)

    def test_concat_data_vars(self) -> None:
        data = Dataset({"foo": ("x", np.random.randn(10))})
        objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))]
        for data_vars in ["minimal", "different", "all", [], ["foo"]]:
            actual = concat(objs, dim="x", data_vars=data_vars)
            assert_identical(data, actual)

    def test_concat_coords(self):
        # TODO: annotating this func fails
        data = Dataset({"foo": ("x", np.random.randn(10))})
        expected = data.assign_coords(c=("x", [0] * 5 + [1] * 5))
        objs = [
            data.isel(x=slice(5)).assign_coords(c=0),
            data.isel(x=slice(5, None)).assign_coords(c=1),
        ]
        for coords in ["different", "all", ["c"]]:
            actual = concat(objs, dim="x", coords=coords)
            assert_identical(expected, actual)
        for coords in ["minimal", []]:
            with pytest.raises(merge.MergeError, match="conflicting values"):
                concat(objs, dim="x", coords=coords)

    def test_concat_constant_index(self):
        # TODO: annotating this func fails
        # GH425
        ds1 = Dataset({"foo": 1.5}, {"y": 1})
        ds2 = Dataset({"foo": 2.5}, {"y": 1})
        expected = Dataset({"foo": ("y", [1.5, 2.5]), "y": [1, 1]})
        for mode in ["different", "all", ["foo"]]:
            actual = concat([ds1, ds2], "y", data_vars=mode)
            assert_identical(expected, actual)

        with pytest.raises(merge.MergeError, match="conflicting values"):
            # previously dim="y", and raised error which makes no sense.
            # "foo" has dimension "y" so minimal should concatenate it?
            concat([ds1, ds2], "new_dim", data_vars="minimal")

    def test_concat_size0(self) -> None:
        data = create_test_data()
        split_data = [data.isel(dim1=slice(0, 0)), data]
        actual = concat(split_data, "dim1")
        assert_identical(data, actual)

        actual = concat(split_data[::-1], "dim1")
        assert_identical(data, actual)

    def test_concat_autoalign(self) -> None:
        ds1 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 2])])})
        ds2 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 3])])})
        actual = concat([ds1, ds2], "y")
        expected = Dataset(
            {
                "foo": DataArray(
                    [[1, 2, np.nan], [1, np.nan, 2]],
                    dims=["y", "x"],
                    coords={"x": [1, 2, 3]},
                )
            }
        )
        assert_identical(expected, actual)

    def test_concat_errors(self):
        # TODO: annotating this func fails
        data = create_test_data()
        split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))]

        with pytest.raises(ValueError, match=r"must supply at least one"):
            concat([], "dim1")

        with pytest.raises(ValueError, match=r"Cannot specify both .*='different'"):
            concat(
                [data, data], dim="concat_dim", data_vars="different", compat="override"
            )

        with pytest.raises(ValueError, match=r"must supply at least one"):
            concat([], "dim1")

        with pytest.raises(ValueError, match=r"are not found in the coordinates"):
            concat([data, data], "new_dim", coords=["not_found"])

        with pytest.raises(ValueError, match=r"are not found in the data variables"):
            concat([data, data], "new_dim", data_vars=["not_found"])

        with pytest.raises(ValueError, match=r"global attributes not"):
            # call deepcopy separately to get unique attrs
            data0 = deepcopy(split_data[0])
            data1 = deepcopy(split_data[1])
            data1.attrs["foo"] = "bar"
            concat([data0, data1], "dim1", compat="identical")
        assert_identical(data, concat([data0, data1], "dim1", compat="equals"))

        with pytest.raises(ValueError, match=r"compat.* invalid"):
            concat(split_data, "dim1", compat="foobar")

        with pytest.raises(ValueError, match=r"compat.* invalid"):
            concat(split_data, "dim1", compat="minimal")

        with pytest.raises(ValueError, match=r"unexpected value for"):
            concat([data, data], "new_dim", coords="foobar")

        with pytest.raises(
            ValueError, match=r"coordinate in some datasets but not others"
        ):
            concat([Dataset({"x": 0}), Dataset({"x": [1]})], dim="z")

        with pytest.raises(
            ValueError, match=r"coordinate in some datasets but not others"
        ):
            concat([Dataset({"x": 0}), Dataset({}, {"x": 1})], dim="z")

    def test_concat_join_kwarg(self) -> None:
        ds1 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]})
        ds2 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]})

        expected: dict[JoinOptions, Any] = {}
        expected["outer"] = Dataset(
            {"a": (("x", "y"), [[0, np.nan], [np.nan, 0]])},
            {"x": [0, 1], "y": [0, 0.0001]},
        )
        expected["inner"] = Dataset(
            {"a": (("x", "y"), [[], []])}, {"x": [0, 1], "y": []}
        )
        expected["left"] = Dataset(
            {"a": (("x", "y"), np.array([0, np.nan], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0]},
        )
        expected["right"] = Dataset(
            {"a": (("x", "y"), np.array([np.nan, 0], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0.0001]},
        )
        expected["override"] = Dataset(
            {"a": (("x", "y"), np.array([0, 0], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0]},
        )

        with pytest.raises(ValueError, match=r"cannot align.*exact.*dimensions.*'y'"):
            actual = concat([ds1, ds2], join="exact", dim="x")

        for join in expected:
            actual = concat([ds1, ds2], join=join, dim="x")
            assert_equal(actual, expected[join])

        # regression test for #3681
        actual = concat(
            [ds1.drop_vars("x"), ds2.drop_vars("x")], join="override", dim="y"
        )
        expected2 = Dataset(
            {"a": (("x", "y"), np.array([0, 0], ndmin=2))}, coords={"y": [0, 0.0001]}
        )
        assert_identical(actual, expected2)

    @pytest.mark.parametrize(
        "combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception",
        [
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 1, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                False,
            ),
            ("no_conflicts", {"a": 1, "b": 2}, {},
             {"a": 1, "b": 2}, False),
            ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 4, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                True,
            ),
            ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
            (
                "override",
                {"a": 1, "b": 2},
                {"a": 4, "b": 5, "c": 3},
                {"a": 1, "b": 2},
                False,
            ),
            (
                "drop_conflicts",
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": 41, "c": 43, "d": 44},
                False,
            ),
            (
                # custom callable combine_attrs
                lambda attrs, context: {"a": -1, "b": 0, "c": 1} if any(attrs) else {},
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": -1, "b": 0, "c": 1},
                False,
            ),
        ],
    )
    def test_concat_combine_attrs_kwarg(
        self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
    ):
        ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs)
        ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs)

        if expect_exception:
            with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
                concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
        else:
            actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
            expected = Dataset(
                {"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs
            )
            assert_identical(actual, expected)

    @pytest.mark.parametrize(
        "combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
        [
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 1, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                False,
            ),
            ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
            ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 4, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                True,
            ),
            ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
            (
                "override",
                {"a": 1, "b":
                 2},
                {"a": 4, "b": 5, "c": 3},
                {"a": 1, "b": 2},
                False,
            ),
            (
                "drop_conflicts",
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": 41, "c": 43, "d": 44},
                False,
            ),
            (
                # custom callable combine_attrs
                lambda attrs, context: {"a": -1, "b": 0, "c": 1} if any(attrs) else {},
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": -1, "b": 0, "c": 1},
                False,
            ),
        ],
    )
    def test_concat_combine_attrs_kwarg_variables(
        self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
    ):
        """check that combine_attrs is used on data variables and coords"""
        ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)})
        ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)})

        if expect_exception:
            with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
                concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
        else:
            actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
            expected = Dataset(
                {"a": ("x", [0, 0], expected_attrs)},
                {"x": ("x", [0, 1], expected_attrs)},
            )
            assert_identical(actual, expected)

    def test_concat_promote_shape(self) -> None:
        # mixed dims within variables
        objs = [Dataset({}, {"x": 0}), Dataset({"x": [1]})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1]})
        assert_identical(actual, expected)

        objs = [Dataset({"x": [0]}), Dataset({}, {"x": 1})]
        actual = concat(objs, "x")
        assert_identical(actual, expected)

        # mixed dims between variables
        objs = [Dataset({"x": [2], "y": 3}), Dataset({"x": [4], "y": 5})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [2, 4], "y": ("x", [3, 5])})
        assert_identical(actual, expected)

        # mixed dims in coord variable
        objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1]}, {"y": ("x", [-2])})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1]}, {"y": ("x", [-1, -2])})
        assert_identical(actual, expected)

        # scalars with mixed lengths along concat dim -- values should repeat
        objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1, 2]}, {"y": -2})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1, 2]}, {"y": ("x", [-1, -2, -2])})
        assert_identical(actual, expected)

        # broadcast 1d x 1d -> 2d
        objs = [
            Dataset({"z": ("x", [-1])}, {"x": [0], "y": [0]}),
            Dataset({"z": ("y", [1])}, {"x": [1], "y": [0]}),
        ]
        actual = concat(objs, "x")
        expected = Dataset({"z": (("x", "y"), [[-1], [1]])}, {"x": [0, 1], "y": [0]})
        assert_identical(actual, expected)

        # regression GH6384
        objs = [
            Dataset({}, {"x": pd.Interval(-1, 0, closed="right")}),
            Dataset({"x": [pd.Interval(0, 1, closed="right")]}),
        ]
        actual = concat(objs, "x")
        expected = Dataset(
            {
                "x": [
                    pd.Interval(-1, 0, closed="right"),
                    pd.Interval(0, 1, closed="right"),
                ]
            }
        )
        assert_identical(actual, expected)

        # regression GH6416 (coord dtype) and GH6434
        time_data1 = np.array(["2022-01-01", "2022-02-01"], dtype="datetime64[ns]")
        time_data2 = np.array("2022-03-01", dtype="datetime64[ns]")
        time_expected = np.array(
            ["2022-01-01", "2022-02-01", "2022-03-01"], dtype="datetime64[ns]"
        )
        objs = [Dataset({}, {"time": time_data1}), Dataset({}, {"time": time_data2})]
        actual = concat(objs, "time")
        expected = Dataset({}, {"time": time_expected})
        assert_identical(actual, expected)
        assert isinstance(actual.indexes["time"], pd.DatetimeIndex)

    def test_concat_do_not_promote(self) -> None:
        # GH438
        objs = [
            Dataset({"y": ("t", [1])}, {"x": 1, "t": [0]}),
            Dataset({"y": ("t", [2])}, {"x": 1, "t": [0]}),
        ]
        expected = Dataset({"y": ("t", [1, 2])}, {"x": 1, "t": [0, 0]})
        actual = concat(objs, "t")
        assert_identical(expected, actual)

        objs = [
            Dataset({"y": ("t", [1])}, {"x": 1, "t": [0]}),
            Dataset({"y": ("t", [2])}, {"x": 2, "t": [0]}),
        ]
        with pytest.raises(ValueError):
            concat(objs, "t", coords="minimal")

    def test_concat_dim_is_variable(self) -> None:
        objs = [Dataset({"x": 0}), Dataset({"x": 1})]
        coord = Variable("y", [3, 4], attrs={"foo": "bar"})
        expected = Dataset({"x": ("y", [0, 1]), "y": coord})
        actual = concat(objs, coord)
        assert_identical(actual, expected)

    def test_concat_dim_is_dataarray(self) -> None:
        objs = [Dataset({"x": 0}), Dataset({"x": 1})]
        coord = DataArray([3, 4], dims="y", attrs={"foo": "bar"})
        expected = Dataset({"x": ("y", [0, 1]), "y": coord})
        actual = concat(objs, coord)
        assert_identical(actual, expected)

    def test_concat_multiindex(self) -> None:
        midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]])
        midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
        expected = Dataset(coords=midx_coords)
        actual = concat(
            [expected.isel(x=slice(2)), expected.isel(x=slice(2, None))], "x"
        )
        assert expected.equals(actual)
        assert isinstance(actual.x.to_index(), pd.MultiIndex)

    def test_concat_along_new_dim_multiindex(self) -> None:
        # see https://github.com/pydata/xarray/issues/6881
        level_names = ["x_level_0", "x_level_1"]
        midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names)
        midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
        ds = Dataset(coords=midx_coords)
        concatenated = concat([ds], "new")
        actual = list(concatenated.xindexes.get_all_coords("x"))
        expected = ["x"] + level_names
        assert actual == expected

    @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
    def test_concat_fill_value(self, fill_value) -> None:
        datasets = [
            Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
            Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
        ]
        if fill_value == dtypes.NA:
            # if we supply the default, we expect the missing value for a
            # float array
            fill_value_a = fill_value_b = np.nan
        elif isinstance(fill_value, dict):
            fill_value_a = fill_value["a"]
            fill_value_b = fill_value["b"]
        else:
            fill_value_a = fill_value_b = fill_value
        expected = Dataset(
            {
                "a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
                "b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
            },
            {"x": [0, 1, 2]},
        )
        actual = concat(datasets, dim="t", fill_value=fill_value)
        assert_identical(actual, expected)

    @pytest.mark.parametrize("dtype", [str, bytes])
    @pytest.mark.parametrize("dim", ["x1", "x2"])
    def test_concat_str_dtype(self, dtype, dim) -> None:
        data = np.arange(4).reshape([2, 2])

        da1 = Dataset(
            {
                "data": (["x1", "x2"], data),
                "x1": [0, 1],
                "x2": np.array(["a", "b"], dtype=dtype),
            }
        )
        da2 = Dataset(
            {
                "data": (["x1", "x2"], data),
                "x1": np.array([1, 2]),
                "x2": np.array(["c", "d"], dtype=dtype),
            }
        )
        actual = concat([da1, da2], dim=dim)

        assert np.issubdtype(actual.x2.dtype, dtype)

    def test_concat_avoids_index_auto_creation(self) -> None:
        # TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first
        coords = Coordinates(
            {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={}
        )
        datasets = [
            Dataset(
                {"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))},
                coords=coords,
            )
            for _ in range(2)
        ]
        # should not raise on concat
        combined = concat(datasets, dim="x")
        assert combined["a"].shape == (6, 3)
        assert combined["a"].dims == ("x", "y")

        # nor have auto-created any indexes
        assert combined.indexes == {}

        # should not raise on stack
        combined = concat(datasets, dim="z")
        assert combined["a"].shape == (2, 3, 3)
        assert combined["a"].dims == ("z", "x", "y")

        # nor have auto-created any indexes
        assert combined.indexes == {}

    def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None:
        # create 0D coordinates (without indexes)
        datasets = [
            Dataset(
                coords={"x": ConcatenatableArray(np.array(10))},
            )
            for _ in range(2)
        ]

        with pytest.raises(UnexpectedDataAccess):
            concat(datasets, dim="x", create_index_for_new_dim=True)

        # should not raise on concat iff create_index_for_new_dim=False
        combined = concat(datasets, dim="x", create_index_for_new_dim=False)
        assert combined["x"].shape == (2,)
        assert combined["x"].dims == ("x",)

        # nor have auto-created any indexes
        assert combined.indexes == {}

    def test_concat_promote_shape_without_creating_new_index(self) -> None:
        # different shapes but neither have indexes
        ds1 = Dataset(coords={"x": 0})
        ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x")
        actual = concat([ds1, ds2], dim="x",
                        create_index_for_new_dim=False)
        expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x")
        assert_identical(actual, expected, check_default_indexes=False)
        assert actual.indexes == {}


class TestConcatDataArray:
    def test_concat(self) -> None:
        ds = Dataset(
            {
                "foo": (["x", "y"], np.random.random((2, 3))),
                "bar": (["x", "y"], np.random.random((2, 3))),
            },
            {"x": [0, 1]},
        )
        foo = ds["foo"]
        bar = ds["bar"]

        # from dataset array:
        expected = DataArray(
            np.array([foo.values, bar.values]),
            dims=["w", "x", "y"],
            coords={"x": [0, 1]},
        )
        actual = concat([foo, bar], "w")
        assert_equal(expected, actual)
        # from iteration:
        grouped = [g.squeeze() for _, g in foo.groupby("x", squeeze=False)]
        stacked = concat(grouped, ds["x"])
        assert_identical(foo, stacked)
        # with an index as the 'dim' argument
        stacked = concat(grouped, pd.Index(ds["x"], name="x"))
        assert_identical(foo, stacked)

        actual2 = concat([foo[0], foo[1]], pd.Index([0, 1])).reset_coords(drop=True)
        expected = foo[:2].rename({"x": "concat_dim"})
        assert_identical(expected, actual2)

        actual3 = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True)
        expected = foo[:2].rename({"x": "concat_dim"})
        assert_identical(expected, actual3)

        with pytest.raises(ValueError, match=r"not identical"):
            concat([foo, bar], dim="w", compat="identical")

        with pytest.raises(ValueError, match=r"not a valid argument"):
            concat([foo, bar], dim="w", data_vars="minimal")

    def test_concat_encoding(self) -> None:
        # Regression test for GH1297
        ds = Dataset(
            {
                "foo": (["x", "y"], np.random.random((2, 3))),
                "bar": (["x", "y"], np.random.random((2, 3))),
            },
            {"x": [0, 1]},
        )
        foo = ds["foo"]
        foo.encoding = {"complevel": 5}
        ds.encoding = {"unlimited_dims": "x"}
        assert concat([foo, foo], dim="x").encoding == foo.encoding
        assert concat([ds, ds], dim="x").encoding == ds.encoding

    @requires_dask
    def test_concat_lazy(self) -> None:
        import dask.array as da

        arrays = [
            DataArray(
                da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), dims=["x", "y"]
            )
            for _ in range(2)
        ]
        # should not raise
        combined = concat(arrays, dim="z")
        assert combined.shape == (2, 3, 3)
        assert combined.dims == ("z", "x", "y")

    def test_concat_avoids_index_auto_creation(self) -> None:
        # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first
        coords = Coordinates(
            {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={}
        )
        arrays = [
            DataArray(
                ConcatenatableArray(np.zeros((3, 3))),
                dims=["x", "y"],
                coords=coords,
            )
            for _ in range(2)
        ]
        # should not raise on concat
        combined = concat(arrays, dim="x")
        assert combined.shape == (6, 3)
        assert combined.dims == ("x", "y")

        # nor have auto-created any indexes
        assert combined.indexes == {}

        # should not raise on stack
        combined = concat(arrays, dim="z")
        assert combined.shape == (2, 3, 3)
        assert combined.dims == ("z", "x", "y")

        # nor have auto-created any indexes
        assert combined.indexes == {}

    @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
    def test_concat_fill_value(self, fill_value) -> None:
        foo = DataArray([1, 2], coords=[("x", [1, 2])])
        bar = DataArray([1, 2], coords=[("x", [1, 3])])
        if fill_value == dtypes.NA:
            # if we supply the default, we expect the missing value for a
            # float array
            fill_value = np.nan
        expected = DataArray(
            [[1, 2, fill_value], [1, fill_value, 2]],
            dims=["y", "x"],
            coords={"x": [1, 2, 3]},
        )
        actual = concat((foo, bar), dim="y", fill_value=fill_value)
        assert_identical(actual, expected)

    def test_concat_join_kwarg(self) -> None:
        ds1 = Dataset(
            {"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]}
        ).to_dataarray()
        ds2 = Dataset(
            {"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]}
        ).to_dataarray()

        expected: dict[JoinOptions, Any] = {}
        expected["outer"] = Dataset(
            {"a": (("x", "y"), [[0, np.nan], [np.nan, 0]])},
            {"x": [0, 1], "y": [0, 0.0001]},
        )
        expected["inner"] = Dataset(
            {"a": (("x", "y"), [[], []])}, {"x": [0, 1], "y": []}
        )
        expected["left"] = Dataset(
            {"a": (("x", "y"), np.array([0, np.nan], ndmin=2).T)},
            coords={"x": [0, 1], "y":
[0]}, ) expected["right"] = Dataset( {"a": (("x", "y"), np.array([np.nan, 0], ndmin=2).T)}, coords={"x": [0, 1], "y": [0.0001]}, ) expected["override"] = Dataset( {"a": (("x", "y"), np.array([0, 0], ndmin=2).T)}, coords={"x": [0, 1], "y": [0]}, ) with pytest.raises(ValueError, match=r"cannot align.*exact.*dimensions.*'y'"): actual = concat([ds1, ds2], join="exact", dim="x") for join in expected: actual = concat([ds1, ds2], join=join, dim="x") assert_equal(actual, expected[join].to_dataarray()) def test_concat_combine_attrs_kwarg(self) -> None: da1 = DataArray([0], coords=[("x", [0])], attrs={"b": 42}) da2 = DataArray([0], coords=[("x", [1])], attrs={"b": 42, "c": 43}) expected: dict[CombineAttrsOptions, Any] = {} expected["drop"] = DataArray([0, 0], coords=[("x", [0, 1])]) expected["no_conflicts"] = DataArray( [0, 0], coords=[("x", [0, 1])], attrs={"b": 42, "c": 43} ) expected["override"] = DataArray( [0, 0], coords=[("x", [0, 1])], attrs={"b": 42} ) with pytest.raises(ValueError, match=r"combine_attrs='identical'"): actual = concat([da1, da2], dim="x", combine_attrs="identical") with pytest.raises(ValueError, match=r"combine_attrs='no_conflicts'"): da3 = da2.copy(deep=True) da3.attrs["b"] = 44 actual = concat([da1, da3], dim="x", combine_attrs="no_conflicts") for combine_attrs in expected: actual = concat([da1, da2], dim="x", combine_attrs=combine_attrs) assert_identical(actual, expected[combine_attrs]) @pytest.mark.parametrize("dtype", [str, bytes]) @pytest.mark.parametrize("dim", ["x1", "x2"]) def test_concat_str_dtype(self, dtype, dim) -> None: data = np.arange(4).reshape([2, 2]) da1 = DataArray( data=data, dims=["x1", "x2"], coords={"x1": [0, 1], "x2": np.array(["a", "b"], dtype=dtype)}, ) da2 = DataArray( data=data, dims=["x1", "x2"], coords={"x1": np.array([1, 2]), "x2": np.array(["c", "d"], dtype=dtype)}, ) actual = concat([da1, da2], dim=dim) assert np.issubdtype(actual.x2.dtype, dtype) def test_concat_coord_name(self) -> None: da = DataArray([0], 
dims="a") da_concat = concat([da, da], dim=DataArray([0, 1], dims="b")) assert list(da_concat.coords) == ["b"] da_concat_std = concat([da, da], dim=DataArray([0, 1])) assert list(da_concat_std.coords) == ["dim_0"] @pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {})) @pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {})) def test_concat_attrs_first_variable(attr1, attr2) -> None: arrs = [ DataArray([[1], [2]], dims=["x", "y"], attrs=attr1), DataArray([[3], [4]], dims=["x", "y"], attrs=attr2), ] concat_attrs = concat(arrs, "y").attrs assert concat_attrs == attr1 def test_concat_merge_single_non_dim_coord(): # TODO: annotating this func fails da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]}) expected = DataArray(range(1, 7), dims="x", coords={"x": range(1, 7), "y": 1}) for coords in ["different", "minimal"]: actual = concat([da1, da2], "x", coords=coords) assert_identical(actual, expected) with pytest.raises(ValueError, match=r"'y' not present in all datasets."): concat([da1, da2], dim="x", coords="all") da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]}) da3 = DataArray([7, 8, 9], dims="x", coords={"x": [7, 8, 9], "y": 1}) for coords in ["different", "all"]: with pytest.raises(ValueError, match=r"'y' not present in all datasets"): concat([da1, da2, da3], dim="x", coords=coords) def test_concat_preserve_coordinate_order() -> None: x = np.arange(0, 5) y = np.arange(0, 10) time = np.arange(0, 4) data = np.zeros((4, 10, 5), dtype=bool) ds1 = Dataset( {"data": (["time", "y", "x"], data[0:2])}, coords={"time": time[0:2], "y": y, "x": x}, ) ds2 = Dataset( {"data": (["time", "y", "x"], data[2:4])}, coords={"time": time[2:4], "y": y, "x": x}, ) expected = Dataset( {"data": (["time", "y", "x"], data)}, coords={"time": time, "y": y, "x": x}, ) actual = concat([ds1, ds2], 
dim="time") # check dimension order for act, exp in zip(actual.dims, expected.dims, strict=True): assert act == exp assert actual.sizes[act] == expected.sizes[exp] # check coordinate order for act, exp in zip(actual.coords, expected.coords, strict=True): assert act == exp assert_identical(actual.coords[act], expected.coords[exp]) def test_concat_typing_check() -> None: ds = Dataset({"foo": 1}, {"bar": 2}) da = Dataset({"foo": 3}, {"bar": 4}).to_dataarray(dim="foo") # concatenate a list of non-homogeneous types must raise TypeError with pytest.raises( TypeError, match="The elements in the input list need to be either all 'Dataset's or all 'DataArray's", ): concat([ds, da], dim="foo") # type: ignore[type-var] with pytest.raises( TypeError, match="The elements in the input list need to be either all 'Dataset's or all 'DataArray's", ): concat([da, ds], dim="foo") # type: ignore[type-var] def test_concat_not_all_indexes() -> None: ds1 = Dataset(coords={"x": ("x", [1, 2])}) # ds2.x has no default index ds2 = Dataset(coords={"x": ("y", [3, 4])}) with pytest.raises( ValueError, match=r"'x' must have either an index or no index in all datasets.*" ): concat([ds1, ds2], dim="x") def test_concat_index_not_same_dim() -> None: ds1 = Dataset(coords={"x": ("x", [1, 2])}) ds2 = Dataset(coords={"x": ("y", [3, 4])}) # TODO: use public API for setting a non-default index, when available ds2._indexes["x"] = PandasIndex([3, 4], "y") with pytest.raises( ValueError, match=r"Cannot concatenate along dimension 'x' indexes with dimensions.*", ): concat([ds1, ds2], dim="x")