# xarray/tests/test_concat.py (excerpt of a 1381-line / 50 KiB Python test module)
from __future__ import annotations
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Literal
import numpy as np
import pandas as pd
import pytest
from xarray import DataArray, Dataset, Variable, concat
from xarray.core import dtypes, merge
from xarray.core.coordinates import Coordinates
from xarray.core.indexes import PandasIndex
from xarray.tests import (
ConcatenatableArray,
InaccessibleArray,
UnexpectedDataAccess,
assert_array_equal,
assert_equal,
assert_identical,
requires_dask,
)
from xarray.tests.test_dataset import create_test_data
if TYPE_CHECKING:
    # Imported only for static type checking; avoids a runtime import.
    from xarray.core.types import CombineAttrsOptions, JoinOptions
# helper method to create multiple tests datasets to concat
def create_concat_datasets(
    num_datasets: int = 2, seed: int | None = None, include_day: bool = True
) -> list[Dataset]:
    """Build ``num_datasets`` Datasets with shared lat/lon coords for concat tests.

    Each dataset has five weather-named variables; when ``include_day`` is True
    the variables carry a "day" dim of length 2 with globally increasing labels
    (day1, day2, day3, ...), otherwise they are 2-D over (x, y).
    NOTE: rng draw order is significant — tests rely on the exact values
    produced for a given ``seed``.
    """
    rng = np.random.default_rng(seed)
    lat = rng.standard_normal(size=(1, 4))
    lon = rng.standard_normal(size=(1, 4))
    result = []
    variables = ["temperature", "pressure", "humidity", "precipitation", "cloud_cover"]
    for i in range(num_datasets):
        if include_day:
            data_tuple = (
                ["x", "y", "day"],
                rng.standard_normal(size=(1, 4, 2)),
            )
            # all five variables intentionally share the same data array
            data_vars = {v: data_tuple for v in variables}
            result.append(
                Dataset(
                    data_vars=data_vars,
                    coords={
                        "lat": (["x", "y"], lat),
                        "lon": (["x", "y"], lon),
                        "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
                    },
                )
            )
        else:
            data_tuple = (
                ["x", "y"],
                rng.standard_normal(size=(1, 4)),
            )
            data_vars = {v: data_tuple for v in variables}
            result.append(
                Dataset(
                    data_vars=data_vars,
                    coords={"lat": (["x", "y"], lat), "lon": (["x", "y"], lon)},
                )
            )
    return result
# helper method to create multiple tests datasets to concat with specific types
def create_typed_datasets(
    num_datasets: int = 2, seed: int | None = None
) -> list[Dataset]:
    """Build ``num_datasets`` Datasets with one variable per dtype family.

    Covers float, string, int, datetime64 and timedelta64 so fill-value
    promotion on concat can be checked per dtype. Each dataset has a "day"
    dim of length 2 with globally increasing labels.
    """
    var_strings = ["a", "b", "c", "d", "e", "f", "g", "h"]
    rng = np.random.default_rng(seed)
    lat = rng.standard_normal(size=(1, 4))
    lon = rng.standard_normal(size=(1, 4))
    return [
        Dataset(
            data_vars={
                "float": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))),
                "float2": (["x", "y", "day"], rng.standard_normal(size=(1, 4, 2))),
                "string": (
                    ["x", "y", "day"],
                    rng.choice(var_strings, size=(1, 4, 2)),
                ),
                "int": (["x", "y", "day"], rng.integers(0, 10, size=(1, 4, 2))),
                "datetime64": (
                    ["x", "y", "day"],
                    np.arange(
                        np.datetime64("2017-01-01"), np.datetime64("2017-01-09")
                    ).reshape(1, 4, 2),
                ),
                "timedelta64": (
                    ["x", "y", "day"],
                    np.reshape([pd.Timedelta(days=i) for i in range(8)], [1, 4, 2]),
                ),
            },
            coords={
                "lat": (["x", "y"], lat),
                "lon": (["x", "y"], lon),
                "day": ["day" + str(i * 2 + 1), "day" + str(i * 2 + 2)],
            },
        )
        for i in range(num_datasets)
    ]
def test_concat_compat() -> None:
    """compat='broadcast_equals' + data_vars='minimal' leaves non-concat vars alone."""
    ds1 = Dataset(
        {
            "has_x_y": (("y", "x"), [[1, 2]]),
            "has_x": ("x", [1, 2]),
            "no_x_y": ("z", [1, 2]),
        },
        coords={"x": [0, 1], "y": [0], "z": [-1, -2]},
    )
    ds2 = Dataset(
        {
            "has_x_y": (("y", "x"), [[3, 4]]),
            "has_x": ("x", [1, 2]),
            "no_x_y": (("q", "z"), [[1, 2]]),
        },
        coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]},
    )
    result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals")
    assert_equal(ds2.no_x_y, result.no_x_y.transpose())
    # variables without a "y" dim must not gain one from the concat
    for var in ["has_x", "no_x_y"]:
        assert "y" not in result[var].dims and "y" not in result[var].coords
    # "q" only exists in ds2, so concat along it must fail in either order
    with pytest.raises(ValueError, match=r"'q' not present in all datasets"):
        concat([ds1, ds2], dim="q")
    with pytest.raises(ValueError, match=r"'q' not present in all datasets"):
        concat([ds2, ds1], dim="q")
def test_concat_missing_var() -> None:
    """A variable missing from one dataset is filled with NaN on its days."""
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["humidity", "precipitation", "cloud_cover"]
    expected = expected.drop_vars(vars_to_drop)
    # "pressure" is dropped from the second dataset, i.e. days 2: are missing
    expected["pressure"][..., 2:] = np.nan
    datasets[0] = datasets[0].drop_vars(vars_to_drop)
    datasets[1] = datasets[1].drop_vars(vars_to_drop + ["pressure"])
    actual = concat(datasets, dim="day")
    assert list(actual.data_vars.keys()) == ["temperature", "pressure"]
    assert_identical(actual, expected)
def test_concat_categorical() -> None:
    """concat of pandas extension-array variables matches _concat_same_type."""
    data1 = create_test_data(use_extension_array=True)
    data2 = create_test_data(use_extension_array=True)
    concatenated = concat([data1, data2], dim="dim1")
    assert (
        concatenated["var4"]
        == type(data2["var4"].variable.data.array)._concat_same_type(
            [
                data1["var4"].variable.data.array,
                data2["var4"].variable.data.array,
            ]
        )
    ).all()
def test_concat_missing_multiple_consecutive_var() -> None:
    """Vars missing from the first two of three datasets are NaN-filled and moved last."""
    datasets = create_concat_datasets(3, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["humidity", "pressure"]
    # dropped from datasets 0 and 1 -> first 4 of 6 days become NaN
    expected["pressure"][..., :4] = np.nan
    expected["humidity"][..., :4] = np.nan
    datasets[0] = datasets[0].drop_vars(vars_to_drop)
    datasets[1] = datasets[1].drop_vars(vars_to_drop)
    actual = concat(datasets, dim="day")
    # vars first seen in the last dataset are appended after the common ones
    assert list(actual.data_vars.keys()) == [
        "temperature",
        "precipitation",
        "cloud_cover",
        "pressure",
        "humidity",
    ]
    assert_identical(actual, expected)
def test_concat_all_empty() -> None:
    """Concatenating two empty Datasets along a new dim yields an empty Dataset."""
    empty_inputs = [Dataset(), Dataset()]
    result = concat(empty_inputs, dim="new_dim")
    assert_identical(result, Dataset())
def test_concat_second_empty() -> None:
    """Concat with an empty second dataset NaN-fills, honoring coords/data_vars modes."""
    ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1})
    ds2 = Dataset(coords={"x": 0.1})
    expected = Dataset(data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": 0.1})
    actual = concat([ds1, ds2], dim="y")
    assert_identical(actual, expected)
    # coords="all" broadcasts the scalar coord along the concat dim
    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])}
    )
    actual = concat([ds1, ds2], dim="y", coords="all")
    assert_identical(actual, expected)
    # Check concatenating scalar data_var only present in ds1
    ds1["b"] = 0.1
    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan]), "b": ("y", [0.1, np.nan])},
        coords={"x": ("y", [0.1, 0.1])},
    )
    actual = concat([ds1, ds2], dim="y", coords="all", data_vars="all")
    assert_identical(actual, expected)
    # "different" keeps equal scalars un-concatenated
    expected = Dataset(
        data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1}
    )
    actual = concat([ds1, ds2], dim="y", coords="different", data_vars="different")
    assert_identical(actual, expected)
def test_concat_multiple_missing_variables() -> None:
    """Several vars missing from the second dataset are NaN-filled, order preserved."""
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    vars_to_drop = ["pressure", "cloud_cover"]
    # dropped from the second dataset -> its days (2:) become NaN
    expected["pressure"][..., 2:] = np.nan
    expected["cloud_cover"][..., 2:] = np.nan
    datasets[1] = datasets[1].drop_vars(vars_to_drop)
    actual = concat(datasets, dim="day")
    # check the variables orders are the same
    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]
    assert_identical(actual, expected)
@pytest.mark.parametrize("include_day", [True, False])
def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None:
    """Each of five datasets drops a different var; concat NaN-fills per dataset."""
    vars_to_drop = [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]
    datasets = create_concat_datasets(
        len(vars_to_drop), seed=123, include_day=include_day
    )
    expected = concat(datasets, dim="day")
    for i, name in enumerate(vars_to_drop):
        if include_day:
            # dataset i contributes days [i*2, i*2+2) along the last axis
            expected[name][..., i * 2 : (i + 1) * 2] = np.nan
        else:
            expected[name][i : i + 1, ...] = np.nan
    # set up the test data
    datasets = [
        ds.drop_vars(varname)
        for ds, varname in zip(datasets, vars_to_drop, strict=True)
    ]
    actual = concat(datasets, dim="day")
    # "temperature" is first seen in the second dataset, so it sorts last
    assert list(actual.data_vars.keys()) == [
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
        "temperature",
    ]
    assert_identical(actual, expected)
def test_concat_multiple_datasets_with_multiple_missing_variables() -> None:
    """Different var subsets missing from each dataset are NaN-filled correctly."""
    vars_to_drop_in_first = ["temperature", "pressure"]
    vars_to_drop_in_second = ["humidity", "precipitation", "cloud_cover"]
    datasets = create_concat_datasets(2, seed=123)
    expected = concat(datasets, dim="day")
    for name in vars_to_drop_in_first:
        expected[name][..., :2] = np.nan
    for name in vars_to_drop_in_second:
        expected[name][..., 2:] = np.nan
    # set up the test data
    datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
    datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)
    actual = concat(datasets, dim="day")
    # first-dataset vars come first; vars only in the second dataset follow
    assert list(actual.data_vars.keys()) == [
        "humidity",
        "precipitation",
        "cloud_cover",
        "temperature",
        "pressure",
    ]
    assert_identical(actual, expected)
def test_concat_type_of_missing_fill() -> None:
    """Default fill on concat promotes each dtype correctly (NaN/NaT/object)."""
    datasets = create_typed_datasets(2, seed=123)
    expected1 = concat(datasets, dim="day", fill_value=dtypes.NA)
    expected2 = concat(datasets[::-1], dim="day", fill_value=dtypes.NA)
    # renamed from `vars` to avoid shadowing the builtin
    var_names = ["float", "float2", "string", "int", "datetime64", "timedelta64"]
    expected = [expected2, expected1]
    for i, exp in enumerate(expected):
        # slice of the concat dim contributed by the dataset missing the vars
        sl = slice(i * 2, (i + 1) * 2)
        exp["float2"][..., sl] = np.nan
        exp["datetime64"][..., sl] = np.nan
        exp["timedelta64"][..., sl] = np.nan
        # int is promoted to float to hold NaN
        var = exp["int"] * 1.0
        var[..., sl] = np.nan
        exp["int"] = var
        # fixed-width strings are promoted to object to hold NaN
        var = exp["string"].astype(object)
        var[..., sl] = np.nan
        exp["string"] = var
    # set up the test data: drop everything but "float" from the second dataset
    datasets[1] = datasets[1].drop_vars(var_names[1:])
    actual = concat(datasets, dim="day", fill_value=dtypes.NA)
    assert_identical(actual, expected[1])
    # reversed
    actual = concat(datasets[::-1], dim="day", fill_value=dtypes.NA)
    assert_identical(actual, expected[0])
def test_concat_order_when_filling_missing() -> None:
    """Variable ordering after fill depends on which dataset comes first."""
    vars_to_drop_in_first: list[str] = []
    # drop middle
    vars_to_drop_in_second = ["humidity"]
    datasets = create_concat_datasets(2, seed=123)
    expected1 = concat(datasets, dim="day")
    for name in vars_to_drop_in_second:
        expected1[name][..., 2:] = np.nan
    expected2 = concat(datasets[::-1], dim="day")
    for name in vars_to_drop_in_second:
        expected2[name][..., :2] = np.nan
    # set up the test data
    datasets[0] = datasets[0].drop_vars(vars_to_drop_in_first)
    datasets[1] = datasets[1].drop_vars(vars_to_drop_in_second)
    actual = concat(datasets, dim="day")
    # first dataset has all vars -> original order kept
    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "humidity",
        "precipitation",
        "cloud_cover",
    ]
    assert_identical(actual, expected1)
    actual = concat(datasets[::-1], dim="day")
    # reversed: "humidity" is first seen in the second input, so it sorts last
    assert list(actual.data_vars.keys()) == [
        "temperature",
        "pressure",
        "precipitation",
        "cloud_cover",
        "humidity",
    ]
    assert_identical(actual, expected2)
@pytest.fixture
def concat_var_names() -> Callable:
    """Fixture: factory returning ``list_cnt`` independent copies of the
    variable-name list ``["d00", "d01", ...]`` (dropping happens elsewhere
    via ``drop_idx``)."""

    def get_varnames(var_cnt: int = 10, list_cnt: int = 10) -> list[list[str]]:
        base = [f"d{idx:02d}" for idx in range(var_cnt)]
        return [base.copy() for _ in range(list_cnt)]

    return get_varnames
@pytest.fixture
def create_concat_ds() -> Callable:
    """Fixture: factory building one Dataset per name list, optionally with a
    "time" dim/coord and optionally dropping one variable per dataset."""

    def create_ds(
        var_names: list[list[str]],
        dim: bool = False,
        coord: bool = False,
        drop_idx: list[int] | None = None,
    ) -> list[Dataset]:
        out_ds = []
        ds = Dataset()
        ds = ds.assign_coords({"x": np.arange(2)})
        ds = ds.assign_coords({"y": np.arange(3)})
        ds = ds.assign_coords({"z": np.arange(4)})
        for i, dsl in enumerate(var_names):
            vlist = dsl.copy()
            if drop_idx is not None:
                # remove one variable name from dataset i
                vlist.pop(drop_idx[i])
            foo_data = np.arange(48, dtype=float).reshape(2, 2, 3, 4)
            dsi = ds.copy()
            if coord:
                dsi = ds.assign({"time": (["time"], [i * 2, i * 2 + 1])})
            for k in vlist:
                dsi = dsi.assign({k: (["time", "x", "y", "z"], foo_data.copy())})
            if not dim:
                # collapse "time" to a scalar so concat creates it anew
                dsi = dsi.isel(time=0)
            out_ds.append(dsi)
        return out_ds

    return create_ds
@pytest.mark.parametrize("dim", [True, False])
@pytest.mark.parametrize("coord", [True, False])
def test_concat_fill_missing_variables(
    concat_var_names, create_concat_ds, dim: bool, coord: bool
) -> None:
    """Per-dataset dropped variables (drop_idx[i] for dataset i) are NaN-filled."""
    var_names = concat_var_names()
    drop_idx = [0, 7, 6, 4, 4, 8, 0, 6, 2, 0]
    expected = concat(
        create_concat_ds(var_names, dim=dim, coord=coord), dim="time", data_vars="all"
    )
    for i, idx in enumerate(drop_idx):
        if dim:
            # dataset i spans time slots [i*2, i*2+2)
            expected[var_names[0][idx]][i * 2 : i * 2 + 2] = np.nan
        else:
            expected[var_names[0][idx]][i] = np.nan
    concat_ds = create_concat_ds(var_names, dim=dim, coord=coord, drop_idx=drop_idx)
    actual = concat(concat_ds, dim="time", data_vars="all")
    # "d00" is dropped from the first dataset, so it sorts last
    assert list(actual.data_vars.keys()) == [
        "d01",
        "d02",
        "d03",
        "d04",
        "d05",
        "d06",
        "d07",
        "d08",
        "d09",
        "d00",
    ]
    assert_identical(actual, expected)
class TestConcatDataset:
    """Tests of ``concat`` applied to Dataset objects: coords/data_vars modes,
    join/compat/fill_value/combine_attrs kwargs, shape promotion, and
    index-creation behavior."""

    @pytest.fixture
    def data(self, request) -> Dataset:
        # parametrizable fixture: request.param toggles pandas extension arrays
        use_extension_array = request.param if hasattr(request, "param") else False
        return create_test_data(use_extension_array=use_extension_array).drop_dims(
            "dim3"
        )

    def rectify_dim_order(self, data: Dataset, dataset) -> Dataset:
        # return a new dataset with all variable dimensions transposed into
        # the order in which they are found in `data`
        return Dataset(
            {k: v.transpose(*data[k].dims) for k, v in dataset.data_vars.items()},
            dataset.coords,
            attrs=dataset.attrs,
        )

    @pytest.mark.parametrize("coords", ["different", "minimal"])
    @pytest.mark.parametrize(
        "dim,data", [["dim1", True], ["dim2", False]], indirect=["data"]
    )
    def test_concat_simple(self, data: Dataset, dim, coords) -> None:
        """Splitting by groupby and re-concatenating round-trips the data."""
        datasets = [g for _, g in data.groupby(dim, squeeze=False)]
        assert_identical(data, concat(datasets, dim, coords=coords))

    def test_concat_merge_variables_present_in_some_datasets(
        self, data: Dataset
    ) -> None:
        # coordinates present in some datasets but not others
        ds1 = Dataset(data_vars={"a": ("y", [0.1])}, coords={"x": 0.1})
        ds2 = Dataset(data_vars={"a": ("y", [0.2])}, coords={"z": 0.2})
        actual = concat([ds1, ds2], dim="y", coords="minimal")
        expected = Dataset({"a": ("y", [0.1, 0.2])}, coords={"x": 0.1, "z": 0.2})
        assert_identical(expected, actual)
        # data variables present in some datasets but not others
        split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))]
        data0, data1 = deepcopy(split_data)
        data1["foo"] = ("bar", np.random.randn(10))
        actual = concat([data0, data1], "dim1", data_vars="minimal")
        expected = data.copy().assign(foo=data1.foo)
        assert_identical(expected, actual)
        # expand foo
        actual = concat([data0, data1], "dim1")
        foo = np.ones((8, 10), dtype=data1.foo.dtype) * np.nan
        foo[3:] = data1.foo.values[None, ...]
        expected = data.copy().assign(foo=(["dim1", "bar"], foo))
        assert_identical(expected, actual)

    @pytest.mark.parametrize("data", [False], indirect=["data"])
    def test_concat_2(self, data: Dataset) -> None:
        """Concat along a coord variable with an explicit coords list round-trips."""
        dim = "dim2"
        datasets = [g.squeeze(dim) for _, g in data.groupby(dim, squeeze=False)]
        concat_over = [k for k, v in data.coords.items() if dim in v.dims and k != dim]
        actual = concat(datasets, data[dim], coords=concat_over)
        assert_identical(data, self.rectify_dim_order(data, actual))

    @pytest.mark.parametrize("coords", ["different", "minimal", "all"])
    @pytest.mark.parametrize("dim", ["dim1", "dim2"])
    def test_concat_coords_kwarg(
        self, data: Dataset, dim: str, coords: Literal["all", "minimal", "different"]
    ) -> None:
        data = data.copy(deep=True)
        # make sure the coords argument behaves as expected
        data.coords["extra"] = ("dim4", np.arange(3))
        datasets = [g.squeeze() for _, g in data.groupby(dim, squeeze=False)]
        actual = concat(datasets, data[dim], coords=coords)
        if coords == "all":
            # "all" stacks the unrelated coord along the concat dim
            expected = np.array([data["extra"].values for _ in range(data.sizes[dim])])
            assert_array_equal(actual["extra"].values, expected)
        else:
            assert_equal(data["extra"], actual["extra"])

    def test_concat(self, data: Dataset) -> None:
        """Concat handles a mix of sliced and scalar-indexed inputs."""
        split_data = [
            data.isel(dim1=slice(3)),
            data.isel(dim1=3),
            data.isel(dim1=slice(4, None)),
        ]
        assert_identical(data, concat(split_data, "dim1"))

    def test_concat_dim_precedence(self, data: Dataset) -> None:
        # verify that the dim argument takes precedence over
        # concatenating dataset variables of the same name
        dim = (2 * data["dim1"]).rename("dim1")
        datasets = [g for _, g in data.groupby("dim1", squeeze=False)]
        expected = data.copy()
        expected["dim1"] = dim
        assert_identical(expected, concat(datasets, dim))

    def test_concat_data_vars_typing(self) -> None:
        # Testing typing, can be removed if the next function works with annotations.
        data = Dataset({"foo": ("x", np.random.randn(10))})
        objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))]
        actual = concat(objs, dim="x", data_vars="minimal")
        assert_identical(data, actual)

    def test_concat_data_vars(self) -> None:
        """All data_vars modes round-trip a single-variable dataset."""
        data = Dataset({"foo": ("x", np.random.randn(10))})
        objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))]
        for data_vars in ["minimal", "different", "all", [], ["foo"]]:
            actual = concat(objs, dim="x", data_vars=data_vars)
            assert_identical(data, actual)

    def test_concat_coords(self):
        # TODO: annotating this func fails
        data = Dataset({"foo": ("x", np.random.randn(10))})
        expected = data.assign_coords(c=("x", [0] * 5 + [1] * 5))
        objs = [
            data.isel(x=slice(5)).assign_coords(c=0),
            data.isel(x=slice(5, None)).assign_coords(c=1),
        ]
        for coords in ["different", "all", ["c"]]:
            actual = concat(objs, dim="x", coords=coords)
            assert_identical(expected, actual)
        # "minimal"/[] refuse to concat the conflicting scalar coord
        for coords in ["minimal", []]:
            with pytest.raises(merge.MergeError, match="conflicting values"):
                concat(objs, dim="x", coords=coords)

    def test_concat_constant_index(self):
        # TODO: annotating this func fails
        # GH425
        ds1 = Dataset({"foo": 1.5}, {"y": 1})
        ds2 = Dataset({"foo": 2.5}, {"y": 1})
        expected = Dataset({"foo": ("y", [1.5, 2.5]), "y": [1, 1]})
        for mode in ["different", "all", ["foo"]]:
            actual = concat([ds1, ds2], "y", data_vars=mode)
            assert_identical(expected, actual)
        with pytest.raises(merge.MergeError, match="conflicting values"):
            # previously dim="y", and raised error which makes no sense.
            # "foo" has dimension "y" so minimal should concatenate it?
            concat([ds1, ds2], "new_dim", data_vars="minimal")

    def test_concat_size0(self) -> None:
        """A zero-length slice contributes nothing, in either position."""
        data = create_test_data()
        split_data = [data.isel(dim1=slice(0, 0)), data]
        actual = concat(split_data, "dim1")
        assert_identical(data, actual)
        actual = concat(split_data[::-1], "dim1")
        assert_identical(data, actual)

    def test_concat_autoalign(self) -> None:
        """Inputs with different index labels are outer-aligned with NaN fill."""
        ds1 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 2])])})
        ds2 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 3])])})
        actual = concat([ds1, ds2], "y")
        expected = Dataset(
            {
                "foo": DataArray(
                    [[1, 2, np.nan], [1, np.nan, 2]],
                    dims=["y", "x"],
                    coords={"x": [1, 2, 3]},
                )
            }
        )
        assert_identical(expected, actual)

    def test_concat_errors(self):
        # TODO: annotating this func fails
        data = create_test_data()
        split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))]
        with pytest.raises(ValueError, match=r"must supply at least one"):
            concat([], "dim1")
        with pytest.raises(ValueError, match=r"Cannot specify both .*='different'"):
            concat(
                [data, data], dim="concat_dim", data_vars="different", compat="override"
            )
        with pytest.raises(ValueError, match=r"must supply at least one"):
            concat([], "dim1")
        with pytest.raises(ValueError, match=r"are not found in the coordinates"):
            concat([data, data], "new_dim", coords=["not_found"])
        with pytest.raises(ValueError, match=r"are not found in the data variables"):
            concat([data, data], "new_dim", data_vars=["not_found"])
        with pytest.raises(ValueError, match=r"global attributes not"):
            # call deepcopy separately to get unique attrs
            data0 = deepcopy(split_data[0])
            data1 = deepcopy(split_data[1])
            data1.attrs["foo"] = "bar"
            concat([data0, data1], "dim1", compat="identical")
        # data0/data1 deliberately reused from the block above
        assert_identical(data, concat([data0, data1], "dim1", compat="equals"))
        with pytest.raises(ValueError, match=r"compat.* invalid"):
            concat(split_data, "dim1", compat="foobar")
        with pytest.raises(ValueError, match=r"compat.* invalid"):
            concat(split_data, "dim1", compat="minimal")
        with pytest.raises(ValueError, match=r"unexpected value for"):
            concat([data, data], "new_dim", coords="foobar")
        with pytest.raises(
            ValueError, match=r"coordinate in some datasets but not others"
        ):
            concat([Dataset({"x": 0}), Dataset({"x": [1]})], dim="z")
        with pytest.raises(
            ValueError, match=r"coordinate in some datasets but not others"
        ):
            concat([Dataset({"x": 0}), Dataset({}, {"x": 1})], dim="z")

    def test_concat_join_kwarg(self) -> None:
        """Each join mode aligns the non-concat dim "y" as documented."""
        ds1 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]})
        ds2 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]})
        expected: dict[JoinOptions, Any] = {}
        expected["outer"] = Dataset(
            {"a": (("x", "y"), [[0, np.nan], [np.nan, 0]])},
            {"x": [0, 1], "y": [0, 0.0001]},
        )
        expected["inner"] = Dataset(
            {"a": (("x", "y"), [[], []])}, {"x": [0, 1], "y": []}
        )
        expected["left"] = Dataset(
            {"a": (("x", "y"), np.array([0, np.nan], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0]},
        )
        expected["right"] = Dataset(
            {"a": (("x", "y"), np.array([np.nan, 0], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0.0001]},
        )
        expected["override"] = Dataset(
            {"a": (("x", "y"), np.array([0, 0], ndmin=2).T)},
            coords={"x": [0, 1], "y": [0]},
        )
        with pytest.raises(ValueError, match=r"cannot align.*exact.*dimensions.*'y'"):
            actual = concat([ds1, ds2], join="exact", dim="x")
        for join in expected:
            actual = concat([ds1, ds2], join=join, dim="x")
            assert_equal(actual, expected[join])
        # regression test for #3681
        actual = concat(
            [ds1.drop_vars("x"), ds2.drop_vars("x")], join="override", dim="y"
        )
        expected2 = Dataset(
            {"a": (("x", "y"), np.array([0, 0], ndmin=2))}, coords={"y": [0, 0.0001]}
        )
        assert_identical(actual, expected2)

    @pytest.mark.parametrize(
        "combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception",
        [
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 1, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                False,
            ),
            ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
            ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 4, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                True,
            ),
            ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
            (
                "override",
                {"a": 1, "b": 2},
                {"a": 4, "b": 5, "c": 3},
                {"a": 1, "b": 2},
                False,
            ),
            (
                "drop_conflicts",
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": 41, "c": 43, "d": 44},
                False,
            ),
            (
                lambda attrs, context: {"a": -1, "b": 0, "c": 1} if any(attrs) else {},
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": -1, "b": 0, "c": 1},
                False,
            ),
        ],
    )
    def test_concat_combine_attrs_kwarg(
        self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
    ):
        """combine_attrs modes merge dataset-level attrs (or raise) as expected."""
        ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs)
        ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs)
        if expect_exception:
            with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
                concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
        else:
            actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
            expected = Dataset(
                {"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs
            )
            assert_identical(actual, expected)

    @pytest.mark.parametrize(
        "combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
        [
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 1, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                False,
            ),
            ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
            ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
            (
                "no_conflicts",
                {"a": 1, "b": 2},
                {"a": 4, "c": 3},
                {"a": 1, "b": 2, "c": 3},
                True,
            ),
            ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
            ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
            (
                "override",
                {"a": 1, "b": 2},
                {"a": 4, "b": 5, "c": 3},
                {"a": 1, "b": 2},
                False,
            ),
            (
                "drop_conflicts",
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": 41, "c": 43, "d": 44},
                False,
            ),
            (
                lambda attrs, context: {"a": -1, "b": 0, "c": 1} if any(attrs) else {},
                {"a": 41, "b": 42, "c": 43},
                {"b": 2, "c": 43, "d": 44},
                {"a": -1, "b": 0, "c": 1},
                False,
            ),
        ],
    )
    def test_concat_combine_attrs_kwarg_variables(
        self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
    ):
        """check that combine_attrs is used on data variables and coords"""
        ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)})
        ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)})
        if expect_exception:
            with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
                concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
        else:
            actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
            expected = Dataset(
                {"a": ("x", [0, 0], expected_attrs)},
                {"x": ("x", [0, 1], expected_attrs)},
            )
            assert_identical(actual, expected)

    def test_concat_promote_shape(self) -> None:
        """Scalars/mismatched dims are promoted/broadcast along the concat dim."""
        # mixed dims within variables
        objs = [Dataset({}, {"x": 0}), Dataset({"x": [1]})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1]})
        assert_identical(actual, expected)
        objs = [Dataset({"x": [0]}), Dataset({}, {"x": 1})]
        actual = concat(objs, "x")
        assert_identical(actual, expected)
        # mixed dims between variables
        objs = [Dataset({"x": [2], "y": 3}), Dataset({"x": [4], "y": 5})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [2, 4], "y": ("x", [3, 5])})
        assert_identical(actual, expected)
        # mixed dims in coord variable
        objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1]}, {"y": ("x", [-2])})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1]}, {"y": ("x", [-1, -2])})
        assert_identical(actual, expected)
        # scalars with mixed lengths along concat dim -- values should repeat
        objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1, 2]}, {"y": -2})]
        actual = concat(objs, "x")
        expected = Dataset({"x": [0, 1, 2]}, {"y": ("x", [-1, -2, -2])})
        assert_identical(actual, expected)
        # broadcast 1d x 1d -> 2d
        objs = [
            Dataset({"z": ("x", [-1])}, {"x": [0], "y": [0]}),
            Dataset({"z": ("y", [1])}, {"x": [1], "y": [0]}),
        ]
        actual = concat(objs, "x")
        expected = Dataset({"z": (("x", "y"), [[-1], [1]])}, {"x": [0, 1], "y": [0]})
        assert_identical(actual, expected)
        # regression GH6384
        objs = [
            Dataset({}, {"x": pd.Interval(-1, 0, closed="right")}),
            Dataset({"x": [pd.Interval(0, 1, closed="right")]}),
        ]
        actual = concat(objs, "x")
        expected = Dataset(
            {
                "x": [
                    pd.Interval(-1, 0, closed="right"),
                    pd.Interval(0, 1, closed="right"),
                ]
            }
        )
        assert_identical(actual, expected)
        # regression GH6416 (coord dtype) and GH6434
        time_data1 = np.array(["2022-01-01", "2022-02-01"], dtype="datetime64[ns]")
        time_data2 = np.array("2022-03-01", dtype="datetime64[ns]")
        time_expected = np.array(
            ["2022-01-01", "2022-02-01", "2022-03-01"], dtype="datetime64[ns]"
        )
        objs = [Dataset({}, {"time": time_data1}), Dataset({}, {"time": time_data2})]
        actual = concat(objs, "time")
        expected = Dataset({}, {"time": time_expected})
        assert_identical(actual, expected)
        assert isinstance(actual.indexes["time"], pd.DatetimeIndex)

    def test_concat_do_not_promote(self) -> None:
        # GH438
        objs = [
            Dataset({"y": ("t", [1])}, {"x": 1, "t": [0]}),
            Dataset({"y": ("t", [2])}, {"x": 1, "t": [0]}),
        ]
        expected = Dataset({"y": ("t", [1, 2])}, {"x": 1, "t": [0, 0]})
        actual = concat(objs, "t")
        assert_identical(expected, actual)
        objs = [
            Dataset({"y": ("t", [1])}, {"x": 1, "t": [0]}),
            Dataset({"y": ("t", [2])}, {"x": 2, "t": [0]}),
        ]
        # conflicting scalar coord "x" must raise under coords="minimal"
        with pytest.raises(ValueError):
            concat(objs, "t", coords="minimal")

    def test_concat_dim_is_variable(self) -> None:
        """Passing a Variable as dim names the new dim and keeps its attrs."""
        objs = [Dataset({"x": 0}), Dataset({"x": 1})]
        coord = Variable("y", [3, 4], attrs={"foo": "bar"})
        expected = Dataset({"x": ("y", [0, 1]), "y": coord})
        actual = concat(objs, coord)
        assert_identical(actual, expected)

    def test_concat_dim_is_dataarray(self) -> None:
        """Passing a DataArray as dim behaves like passing a Variable."""
        objs = [Dataset({"x": 0}), Dataset({"x": 1})]
        coord = DataArray([3, 4], dims="y", attrs={"foo": "bar"})
        expected = Dataset({"x": ("y", [0, 1]), "y": coord})
        actual = concat(objs, coord)
        assert_identical(actual, expected)

    def test_concat_multiindex(self) -> None:
        """Splitting and re-concatenating preserves a pandas MultiIndex."""
        midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]])
        midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
        expected = Dataset(coords=midx_coords)
        actual = concat(
            [expected.isel(x=slice(2)), expected.isel(x=slice(2, None))], "x"
        )
        assert expected.equals(actual)
        assert isinstance(actual.x.to_index(), pd.MultiIndex)

    def test_concat_along_new_dim_multiindex(self) -> None:
        # see https://github.com/pydata/xarray/issues/6881
        level_names = ["x_level_0", "x_level_1"]
        midx = pd.MultiIndex.from_product([[1, 2, 3], ["a", "b"]], names=level_names)
        midx_coords = Coordinates.from_pandas_multiindex(midx, "x")
        ds = Dataset(coords=midx_coords)
        concatenated = concat([ds], "new")
        actual = list(concatenated.xindexes.get_all_coords("x"))
        expected = ["x"] + level_names
        assert actual == expected

    @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
    def test_concat_fill_value(self, fill_value) -> None:
        """fill_value may be scalar or a per-variable dict; default is NaN."""
        datasets = [
            Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
            Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
        ]
        if fill_value == dtypes.NA:
            # if we supply the default, we expect the missing value for a
            # float array
            fill_value_a = fill_value_b = np.nan
        elif isinstance(fill_value, dict):
            fill_value_a = fill_value["a"]
            fill_value_b = fill_value["b"]
        else:
            fill_value_a = fill_value_b = fill_value
        expected = Dataset(
            {
                "a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
                "b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
            },
            {"x": [0, 1, 2]},
        )
        actual = concat(datasets, dim="t", fill_value=fill_value)
        assert_identical(actual, expected)

    @pytest.mark.parametrize("dtype", [str, bytes])
    @pytest.mark.parametrize("dim", ["x1", "x2"])
    def test_concat_str_dtype(self, dtype, dim) -> None:
        """String/bytes coord dtypes survive concat along either dim."""
        data = np.arange(4).reshape([2, 2])
        da1 = Dataset(
            {
                "data": (["x1", "x2"], data),
                "x1": [0, 1],
                "x2": np.array(["a", "b"], dtype=dtype),
            }
        )
        da2 = Dataset(
            {
                "data": (["x1", "x2"], data),
                "x1": np.array([1, 2]),
                "x2": np.array(["c", "d"], dtype=dtype),
            }
        )
        actual = concat([da1, da2], dim=dim)
        assert np.issubdtype(actual.x2.dtype, dtype)

    def test_concat_avoids_index_auto_creation(self) -> None:
        # TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first
        coords = Coordinates(
            {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={}
        )
        datasets = [
            Dataset(
                {"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))},
                coords=coords,
            )
            for _ in range(2)
        ]
        # should not raise on concat
        combined = concat(datasets, dim="x")
        assert combined["a"].shape == (6, 3)
        assert combined["a"].dims == ("x", "y")
        # nor have auto-created any indexes
        assert combined.indexes == {}
        # should not raise on stack
        combined = concat(datasets, dim="z")
        assert combined["a"].shape == (2, 3, 3)
        assert combined["a"].dims == ("z", "x", "y")
        # nor have auto-created any indexes
        assert combined.indexes == {}

    def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None:
        # create 0D coordinates (without indexes)
        datasets = [
            Dataset(
                coords={"x": ConcatenatableArray(np.array(10))},
            )
            for _ in range(2)
        ]
        with pytest.raises(UnexpectedDataAccess):
            concat(datasets, dim="x", create_index_for_new_dim=True)
        # should not raise on concat iff create_index_for_new_dim=False
        combined = concat(datasets, dim="x", create_index_for_new_dim=False)
        assert combined["x"].shape == (2,)
        assert combined["x"].dims == ("x",)
        # nor have auto-created any indexes
        assert combined.indexes == {}

    def test_concat_promote_shape_without_creating_new_index(self) -> None:
        # different shapes but neither have indexes
        ds1 = Dataset(coords={"x": 0})
        ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x")
        actual = concat([ds1, ds2], dim="x", create_index_for_new_dim=False)
        expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x")
        assert_identical(actual, expected, check_default_indexes=False)
        assert actual.indexes == {}
class TestConcatDataArray:
    """Tests of ``concat`` applied to DataArray objects (class continues
    beyond this excerpt)."""

    def test_concat(self) -> None:
        """Basic DataArray concat: new dim, groupby round-trip, Index/list dims."""
        ds = Dataset(
            {
                "foo": (["x", "y"], np.random.random((2, 3))),
                "bar": (["x", "y"], np.random.random((2, 3))),
            },
            {"x": [0, 1]},
        )
        foo = ds["foo"]
        bar = ds["bar"]
        # from dataset array:
        expected = DataArray(
            np.array([foo.values, bar.values]),
            dims=["w", "x", "y"],
            coords={"x": [0, 1]},
        )
        actual = concat([foo, bar], "w")
        assert_equal(expected, actual)
        # from iteration:
        grouped = [g.squeeze() for _, g in foo.groupby("x", squeeze=False)]
        stacked = concat(grouped, ds["x"])
        assert_identical(foo, stacked)
        # with an index as the 'dim' argument
        stacked = concat(grouped, pd.Index(ds["x"], name="x"))
        assert_identical(foo, stacked)
        actual2 = concat([foo[0], foo[1]], pd.Index([0, 1])).reset_coords(drop=True)
        expected = foo[:2].rename({"x": "concat_dim"})
        assert_identical(expected, actual2)
        actual3 = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True)
        expected = foo[:2].rename({"x": "concat_dim"})
        assert_identical(expected, actual3)
        with pytest.raises(ValueError, match=r"not identical"):
            concat([foo, bar], dim="w", compat="identical")
        # data_vars is a Dataset-only kwarg
        with pytest.raises(ValueError, match=r"not a valid argument"):
            concat([foo, bar], dim="w", data_vars="minimal")
def test_concat_encoding(self) -> None:
# Regression test for GH1297
ds = Dataset(
{
"foo": (["x", "y"], np.random.random((2, 3))),
"bar": (["x", "y"], np.random.random((2, 3))),
},
{"x": [0, 1]},
)
foo = ds["foo"]
foo.encoding = {"complevel": 5}
ds.encoding = {"unlimited_dims": "x"}
assert concat([foo, foo], dim="x").encoding == foo.encoding
assert concat([ds, ds], dim="x").encoding == ds.encoding
@requires_dask
def test_concat_lazy(self) -> None:
import dask.array as da
arrays = [
DataArray(
da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), dims=["x", "y"]
)
for _ in range(2)
]
# should not raise
combined = concat(arrays, dim="z")
assert combined.shape == (2, 3, 3)
assert combined.dims == ("z", "x", "y")
    def test_concat_avoids_index_auto_creation(self) -> None:
        # ConcatenatableArray raises UnexpectedDataAccess if any operation
        # reads its values, so a passing test proves concat neither loads
        # data nor builds a pandas index for "x".
        # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first
        coords = Coordinates(
            {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={}
        )
        arrays = [
            DataArray(
                ConcatenatableArray(np.zeros((3, 3))),
                dims=["x", "y"],
                coords=coords,
            )
            for _ in range(2)
        ]
        # should not raise on concat along an existing dimension
        combined = concat(arrays, dim="x")
        assert combined.shape == (6, 3)
        assert combined.dims == ("x", "y")
        # nor have auto-created any indexes
        assert combined.indexes == {}
        # should not raise on stack (concat along a brand-new dimension)
        combined = concat(arrays, dim="z")
        assert combined.shape == (2, 3, 3)
        assert combined.dims == ("z", "x", "y")
        # nor have auto-created any indexes
        assert combined.indexes == {}
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
def test_concat_fill_value(self, fill_value) -> None:
foo = DataArray([1, 2], coords=[("x", [1, 2])])
bar = DataArray([1, 2], coords=[("x", [1, 3])])
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value = np.nan
expected = DataArray(
[[1, 2, fill_value], [1, fill_value, 2]],
dims=["y", "x"],
coords={"x": [1, 2, 3]},
)
actual = concat((foo, bar), dim="y", fill_value=fill_value)
assert_identical(actual, expected)
def test_concat_join_kwarg(self) -> None:
ds1 = Dataset(
{"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]}
).to_dataarray()
ds2 = Dataset(
{"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]}
).to_dataarray()
expected: dict[JoinOptions, Any] = {}
expected["outer"] = Dataset(
{"a": (("x", "y"), [[0, np.nan], [np.nan, 0]])},
{"x": [0, 1], "y": [0, 0.0001]},
)
expected["inner"] = Dataset(
{"a": (("x", "y"), [[], []])}, {"x": [0, 1], "y": []}
)
expected["left"] = Dataset(
{"a": (("x", "y"), np.array([0, np.nan], ndmin=2).T)},
coords={"x": [0, 1], "y": [0]},
)
expected["right"] = Dataset(
{"a": (("x", "y"), np.array([np.nan, 0], ndmin=2).T)},
coords={"x": [0, 1], "y": [0.0001]},
)
expected["override"] = Dataset(
{"a": (("x", "y"), np.array([0, 0], ndmin=2).T)},
coords={"x": [0, 1], "y": [0]},
)
with pytest.raises(ValueError, match=r"cannot align.*exact.*dimensions.*'y'"):
actual = concat([ds1, ds2], join="exact", dim="x")
for join in expected:
actual = concat([ds1, ds2], join=join, dim="x")
assert_equal(actual, expected[join].to_dataarray())
def test_concat_combine_attrs_kwarg(self) -> None:
da1 = DataArray([0], coords=[("x", [0])], attrs={"b": 42})
da2 = DataArray([0], coords=[("x", [1])], attrs={"b": 42, "c": 43})
expected: dict[CombineAttrsOptions, Any] = {}
expected["drop"] = DataArray([0, 0], coords=[("x", [0, 1])])
expected["no_conflicts"] = DataArray(
[0, 0], coords=[("x", [0, 1])], attrs={"b": 42, "c": 43}
)
expected["override"] = DataArray(
[0, 0], coords=[("x", [0, 1])], attrs={"b": 42}
)
with pytest.raises(ValueError, match=r"combine_attrs='identical'"):
actual = concat([da1, da2], dim="x", combine_attrs="identical")
with pytest.raises(ValueError, match=r"combine_attrs='no_conflicts'"):
da3 = da2.copy(deep=True)
da3.attrs["b"] = 44
actual = concat([da1, da3], dim="x", combine_attrs="no_conflicts")
for combine_attrs in expected:
actual = concat([da1, da2], dim="x", combine_attrs=combine_attrs)
assert_identical(actual, expected[combine_attrs])
@pytest.mark.parametrize("dtype", [str, bytes])
@pytest.mark.parametrize("dim", ["x1", "x2"])
def test_concat_str_dtype(self, dtype, dim) -> None:
data = np.arange(4).reshape([2, 2])
da1 = DataArray(
data=data,
dims=["x1", "x2"],
coords={"x1": [0, 1], "x2": np.array(["a", "b"], dtype=dtype)},
)
da2 = DataArray(
data=data,
dims=["x1", "x2"],
coords={"x1": np.array([1, 2]), "x2": np.array(["c", "d"], dtype=dtype)},
)
actual = concat([da1, da2], dim=dim)
assert np.issubdtype(actual.x2.dtype, dtype)
def test_concat_coord_name(self) -> None:
da = DataArray([0], dims="a")
da_concat = concat([da, da], dim=DataArray([0, 1], dims="b"))
assert list(da_concat.coords) == ["b"]
da_concat_std = concat([da, da], dim=DataArray([0, 1]))
assert list(da_concat_std.coords) == ["dim_0"]
@pytest.mark.parametrize(
    "attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {})
)
@pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {}))
def test_concat_attrs_first_variable(attr1, attr2) -> None:
    # the result's attrs are taken from the first operand, whatever attr2 is
    pieces = [
        DataArray([[1], [2]], dims=["x", "y"], attrs=attr1),
        DataArray([[3], [4]], dims=["x", "y"], attrs=attr2),
    ]
    assert concat(pieces, "y").attrs == attr1
def test_concat_merge_single_non_dim_coord():
    # TODO: annotating this func fails
    da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1})
    da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]})
    expected = DataArray(range(1, 7), dims="x", coords={"x": range(1, 7), "y": 1})

    # "different" and "minimal" carry the scalar coord "y" through from da1
    for coords in ["different", "minimal"]:
        actual = concat([da1, da2], "x", coords=coords)
        assert_identical(actual, expected)

    # "all" demands "y" exist everywhere
    with pytest.raises(ValueError, match=r"'y' not present in all datasets."):
        concat([da1, da2], dim="x", coords="all")

    da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1})
    da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]})
    da3 = DataArray([7, 8, 9], dims="x", coords={"x": [7, 8, 9], "y": 1})
    # a coord present in only some inputs is also rejected for "different"
    for coords in ["different", "all"]:
        with pytest.raises(ValueError, match=r"'y' not present in all datasets"):
            concat([da1, da2, da3], dim="x", coords=coords)
def test_concat_preserve_coordinate_order() -> None:
    """Concatenating along "time" keeps both dimension and coordinate order."""
    x = np.arange(0, 5)
    y = np.arange(0, 10)
    time = np.arange(0, 4)
    data = np.zeros((4, 10, 5), dtype=bool)

    def make_subset(sl: slice) -> Dataset:
        # one dataset covering the given slice of the "time" axis
        return Dataset(
            {"data": (["time", "y", "x"], data[sl])},
            coords={"time": time[sl], "y": y, "x": x},
        )

    actual = concat([make_subset(slice(0, 2)), make_subset(slice(2, 4))], dim="time")
    expected = make_subset(slice(None))

    # check dimension order
    for act, exp in zip(actual.dims, expected.dims, strict=True):
        assert act == exp
        assert actual.sizes[act] == expected.sizes[exp]

    # check coordinate order
    for act, exp in zip(actual.coords, expected.coords, strict=True):
        assert act == exp
        assert_identical(actual.coords[act], expected.coords[exp])
def test_concat_typing_check() -> None:
    """Mixing Dataset and DataArray inputs (either order) raises TypeError."""
    ds = Dataset({"foo": 1}, {"bar": 2})
    da = Dataset({"foo": 3}, {"bar": 4}).to_dataarray(dim="foo")

    for mixed in ([ds, da], [da, ds]):
        with pytest.raises(
            TypeError,
            match="The elements in the input list need to be either all 'Dataset's or all 'DataArray's",
        ):
            concat(mixed, dim="foo")  # type: ignore[type-var]
def test_concat_not_all_indexes() -> None:
    indexed = Dataset(coords={"x": ("x", [1, 2])})
    # here "x" lies along dim "y", so it gets no default index
    unindexed = Dataset(coords={"x": ("y", [3, 4])})

    with pytest.raises(
        ValueError, match=r"'x' must have either an index or no index in all datasets.*"
    ):
        concat([indexed, unindexed], dim="x")
def test_concat_index_not_same_dim() -> None:
    left = Dataset(coords={"x": ("x", [1, 2])})
    right = Dataset(coords={"x": ("y", [3, 4])})
    # TODO: use public API for setting a non-default index, when available
    # give right.x an index whose dimension ("y") differs from the concat dim
    right._indexes["x"] = PandasIndex([3, 4], "y")

    with pytest.raises(
        ValueError,
        match=r"Cannot concatenate along dimension 'x' indexes with dimensions.*",
    ):
        concat([left, right], dim="x")