CCR/.venv/lib/python3.12/site-packages/xarray/tests/test_combine.py

1186 lines
44 KiB
Python

from __future__ import annotations
from itertools import product
import numpy as np
import pytest
from xarray import (
DataArray,
Dataset,
MergeError,
combine_by_coords,
combine_nested,
concat,
merge,
)
from xarray.core import dtypes
from xarray.core.combine import (
_check_shape_tile_ids,
_combine_all_along_first_dim,
_combine_nd,
_infer_concat_order_from_coords,
_infer_concat_order_from_positions,
_new_tile_id,
)
from xarray.tests import assert_equal, assert_identical, requires_cftime
from xarray.tests.test_dataset import create_test_data
def assert_combined_tile_ids_equal(dict1, dict2):
assert len(dict1) == len(dict2)
for k in dict1.keys():
assert k in dict2.keys()
assert_equal(dict1[k], dict2[k])
class TestTileIDsFromNestedList:
def test_1d(self):
ds = create_test_data
input = [ds(0), ds(1)]
expected = {(0,): ds(0), (1,): ds(1)}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_2d(self):
ds = create_test_data
input = [[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]]
expected = {
(0, 0): ds(0),
(0, 1): ds(1),
(1, 0): ds(2),
(1, 1): ds(3),
(2, 0): ds(4),
(2, 1): ds(5),
}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_3d(self):
ds = create_test_data
input = [
[[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]],
[[ds(6), ds(7)], [ds(8), ds(9)], [ds(10), ds(11)]],
]
expected = {
(0, 0, 0): ds(0),
(0, 0, 1): ds(1),
(0, 1, 0): ds(2),
(0, 1, 1): ds(3),
(0, 2, 0): ds(4),
(0, 2, 1): ds(5),
(1, 0, 0): ds(6),
(1, 0, 1): ds(7),
(1, 1, 0): ds(8),
(1, 1, 1): ds(9),
(1, 2, 0): ds(10),
(1, 2, 1): ds(11),
}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_single_dataset(self):
ds = create_test_data(0)
input = [ds]
expected = {(0,): ds}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_redundant_nesting(self):
ds = create_test_data
input = [[ds(0)], [ds(1)]]
expected = {(0, 0): ds(0), (1, 0): ds(1)}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_ignore_empty_list(self):
ds = create_test_data(0)
input = [ds, []]
expected = {(0,): ds}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_uneven_depth_input(self):
# Auto_combine won't work on ragged input
# but this is just to increase test coverage
ds = create_test_data
input = [ds(0), [ds(1), ds(2)]]
expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_uneven_length_input(self):
# Auto_combine won't work on ragged input
# but this is just to increase test coverage
ds = create_test_data
input = [[ds(0)], [ds(1), ds(2)]]
expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
def test_infer_from_datasets(self):
ds = create_test_data
input = [ds(0), ds(1)]
expected = {(0,): ds(0), (1,): ds(1)}
actual = _infer_concat_order_from_positions(input)
assert_combined_tile_ids_equal(expected, actual)
class TestTileIDsFromCoords:
def test_1d(self):
ds0 = Dataset({"x": [0, 1]})
ds1 = Dataset({"x": [2, 3]})
expected = {(0,): ds0, (1,): ds1}
actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["x"]
def test_2d(self):
ds0 = Dataset({"x": [0, 1], "y": [10, 20, 30]})
ds1 = Dataset({"x": [2, 3], "y": [10, 20, 30]})
ds2 = Dataset({"x": [0, 1], "y": [40, 50, 60]})
ds3 = Dataset({"x": [2, 3], "y": [40, 50, 60]})
ds4 = Dataset({"x": [0, 1], "y": [70, 80, 90]})
ds5 = Dataset({"x": [2, 3], "y": [70, 80, 90]})
expected = {
(0, 0): ds0,
(1, 0): ds1,
(0, 1): ds2,
(1, 1): ds3,
(0, 2): ds4,
(1, 2): ds5,
}
actual, concat_dims = _infer_concat_order_from_coords(
[ds1, ds0, ds3, ds5, ds2, ds4]
)
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["x", "y"]
def test_no_dimension_coords(self):
ds0 = Dataset({"foo": ("x", [0, 1])})
ds1 = Dataset({"foo": ("x", [2, 3])})
with pytest.raises(ValueError, match=r"Could not find any dimension"):
_infer_concat_order_from_coords([ds1, ds0])
def test_coord_not_monotonic(self):
ds0 = Dataset({"x": [0, 1]})
ds1 = Dataset({"x": [3, 2]})
with pytest.raises(
ValueError,
match=r"Coordinate variable x is neither monotonically increasing nor",
):
_infer_concat_order_from_coords([ds1, ds0])
def test_coord_monotonically_decreasing(self):
ds0 = Dataset({"x": [3, 2]})
ds1 = Dataset({"x": [1, 0]})
expected = {(0,): ds0, (1,): ds1}
actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["x"]
def test_no_concatenation_needed(self):
ds = Dataset({"foo": ("x", [0, 1])})
expected = {(): ds}
actual, concat_dims = _infer_concat_order_from_coords([ds])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == []
def test_2d_plus_bystander_dim(self):
ds0 = Dataset({"x": [0, 1], "y": [10, 20, 30], "t": [0.1, 0.2]})
ds1 = Dataset({"x": [2, 3], "y": [10, 20, 30], "t": [0.1, 0.2]})
ds2 = Dataset({"x": [0, 1], "y": [40, 50, 60], "t": [0.1, 0.2]})
ds3 = Dataset({"x": [2, 3], "y": [40, 50, 60], "t": [0.1, 0.2]})
expected = {(0, 0): ds0, (1, 0): ds1, (0, 1): ds2, (1, 1): ds3}
actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0, ds3, ds2])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["x", "y"]
def test_string_coords(self):
ds0 = Dataset({"person": ["Alice", "Bob"]})
ds1 = Dataset({"person": ["Caroline", "Daniel"]})
expected = {(0,): ds0, (1,): ds1}
actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["person"]
# Decided against natural sorting of string coords GH #2616
def test_lexicographic_sort_string_coords(self):
ds0 = Dataset({"simulation": ["run8", "run9"]})
ds1 = Dataset({"simulation": ["run10", "run11"]})
expected = {(0,): ds1, (1,): ds0}
actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["simulation"]
def test_datetime_coords(self):
ds0 = Dataset(
{"time": np.array(["2000-03-06", "2000-03-07"], dtype="datetime64[ns]")}
)
ds1 = Dataset(
{"time": np.array(["1999-01-01", "1999-02-04"], dtype="datetime64[ns]")}
)
expected = {(0,): ds1, (1,): ds0}
actual, concat_dims = _infer_concat_order_from_coords([ds0, ds1])
assert_combined_tile_ids_equal(expected, actual)
assert concat_dims == ["time"]
@pytest.fixture(scope="module")
def create_combined_ids():
return _create_combined_ids
def _create_combined_ids(shape):
tile_ids = _create_tile_ids(shape)
nums = range(len(tile_ids))
return {
tile_id: create_test_data(num)
for tile_id, num in zip(tile_ids, nums, strict=True)
}
def _create_tile_ids(shape):
tile_ids = product(*(range(i) for i in shape))
return list(tile_ids)
class TestNewTileIDs:
@pytest.mark.parametrize(
"old_id, new_id",
[((3, 0, 1), (0, 1)), ((0, 0), (0,)), ((1,), ()), ((0,), ()), ((1, 0), (0,))],
)
def test_new_tile_id(self, old_id, new_id):
ds = create_test_data
assert _new_tile_id((old_id, ds)) == new_id
def test_get_new_tile_ids(self, create_combined_ids):
shape = (1, 2, 3)
combined_ids = create_combined_ids(shape)
expected_tile_ids = sorted(combined_ids.keys())
actual_tile_ids = _create_tile_ids(shape)
assert expected_tile_ids == actual_tile_ids
class TestCombineND:
@pytest.mark.parametrize("concat_dim", ["dim1", "new_dim"])
def test_concat_once(self, create_combined_ids, concat_dim):
shape = (2,)
combined_ids = create_combined_ids(shape)
ds = create_test_data
result = _combine_all_along_first_dim(
combined_ids,
dim=concat_dim,
data_vars="all",
coords="different",
compat="no_conflicts",
)
expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
assert_combined_tile_ids_equal(result, {(): expected_ds})
def test_concat_only_first_dim(self, create_combined_ids):
shape = (2, 3)
combined_ids = create_combined_ids(shape)
result = _combine_all_along_first_dim(
combined_ids,
dim="dim1",
data_vars="all",
coords="different",
compat="no_conflicts",
)
ds = create_test_data
partway1 = concat([ds(0), ds(3)], dim="dim1")
partway2 = concat([ds(1), ds(4)], dim="dim1")
partway3 = concat([ds(2), ds(5)], dim="dim1")
expected_datasets = [partway1, partway2, partway3]
expected = {(i,): ds for i, ds in enumerate(expected_datasets)}
assert_combined_tile_ids_equal(result, expected)
@pytest.mark.parametrize("concat_dim", ["dim1", "new_dim"])
def test_concat_twice(self, create_combined_ids, concat_dim):
shape = (2, 3)
combined_ids = create_combined_ids(shape)
result = _combine_nd(combined_ids, concat_dims=["dim1", concat_dim])
ds = create_test_data
partway1 = concat([ds(0), ds(3)], dim="dim1")
partway2 = concat([ds(1), ds(4)], dim="dim1")
partway3 = concat([ds(2), ds(5)], dim="dim1")
expected = concat([partway1, partway2, partway3], dim=concat_dim)
assert_equal(result, expected)
class TestCheckShapeTileIDs:
def test_check_depths(self):
ds = create_test_data(0)
combined_tile_ids = {(0,): ds, (0, 1): ds}
with pytest.raises(
ValueError, match=r"sub-lists do not have consistent depths"
):
_check_shape_tile_ids(combined_tile_ids)
def test_check_lengths(self):
ds = create_test_data(0)
combined_tile_ids = {(0, 0): ds, (0, 1): ds, (0, 2): ds, (1, 0): ds, (1, 1): ds}
with pytest.raises(
ValueError, match=r"sub-lists do not have consistent lengths"
):
_check_shape_tile_ids(combined_tile_ids)
class TestNestedCombine:
def test_nested_concat(self):
objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
expected = Dataset({"x": [0, 1]})
actual = combine_nested(objs, concat_dim="x")
assert_identical(expected, actual)
actual = combine_nested(objs, concat_dim=["x"])
assert_identical(expected, actual)
actual = combine_nested([actual], concat_dim=None)
assert_identical(expected, actual)
actual = combine_nested([actual], concat_dim="x")
assert_identical(expected, actual)
objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})]
actual = combine_nested(objs, concat_dim="x")
expected = Dataset({"x": [0, 1, 2]})
assert_identical(expected, actual)
# ensure combine_nested handles non-sorted variables
objs = [
Dataset({"x": ("a", [0]), "y": ("a", [0])}),
Dataset({"y": ("a", [1]), "x": ("a", [1])}),
]
actual = combine_nested(objs, concat_dim="a")
expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1])})
assert_identical(expected, actual)
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1]})]
actual = combine_nested(objs, concat_dim="x")
expected = Dataset({"x": [0, 1], "y": [0]})
assert_identical(expected, actual)
@pytest.mark.parametrize(
"join, expected",
[
("outer", Dataset({"x": [0, 1], "y": [0, 1]})),
("inner", Dataset({"x": [0, 1], "y": []})),
("left", Dataset({"x": [0, 1], "y": [0]})),
("right", Dataset({"x": [0, 1], "y": [1]})),
],
)
def test_combine_nested_join(self, join, expected):
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})]
actual = combine_nested(objs, concat_dim="x", join=join)
assert_identical(expected, actual)
def test_combine_nested_join_exact(self):
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})]
with pytest.raises(ValueError, match=r"cannot align.*join.*exact"):
combine_nested(objs, concat_dim="x", join="exact")
def test_empty_input(self):
assert_identical(Dataset(), combine_nested([], concat_dim="x"))
# Fails because of concat's weird treatment of dimension coords, see #2975
@pytest.mark.xfail
def test_nested_concat_too_many_dims_at_once(self):
objs = [Dataset({"x": [0], "y": [1]}), Dataset({"y": [0], "x": [1]})]
with pytest.raises(ValueError, match="not equal across datasets"):
combine_nested(objs, concat_dim="x", coords="minimal")
def test_nested_concat_along_new_dim(self):
objs = [
Dataset({"a": ("x", [10]), "x": [0]}),
Dataset({"a": ("x", [20]), "x": [0]}),
]
expected = Dataset({"a": (("t", "x"), [[10], [20]]), "x": [0]})
actual = combine_nested(objs, concat_dim="t")
assert_identical(expected, actual)
# Same but with a DataArray as new dim, see GH #1988 and #2647
dim = DataArray([100, 150], name="baz", dims="baz")
expected = Dataset(
{"a": (("baz", "x"), [[10], [20]]), "x": [0], "baz": [100, 150]}
)
actual = combine_nested(objs, concat_dim=dim)
assert_identical(expected, actual)
def test_nested_merge(self):
data = Dataset({"x": 0})
actual = combine_nested([data, data, data], concat_dim=None)
assert_identical(data, actual)
ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]})
ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]})
expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]})
actual = combine_nested([ds1, ds2], concat_dim=None)
assert_identical(expected, actual)
actual = combine_nested([ds1, ds2], concat_dim=[None])
assert_identical(expected, actual)
tmp1 = Dataset({"x": 0})
tmp2 = Dataset({"x": np.nan})
actual = combine_nested([tmp1, tmp2], concat_dim=None)
assert_identical(tmp1, actual)
actual = combine_nested([tmp1, tmp2], concat_dim=[None])
assert_identical(tmp1, actual)
# Single object, with a concat_dim explicitly provided
# Test the issue reported in GH #1988
objs = [Dataset({"x": 0, "y": 1})]
dim = DataArray([100], name="baz", dims="baz")
actual = combine_nested(objs, concat_dim=[dim])
expected = Dataset({"x": ("baz", [0]), "y": ("baz", [1])}, {"baz": [100]})
assert_identical(expected, actual)
# Just making sure that auto_combine is doing what is
# expected for non-scalar values, too.
objs = [Dataset({"x": ("z", [0, 1]), "y": ("z", [1, 2])})]
dim = DataArray([100], name="baz", dims="baz")
actual = combine_nested(objs, concat_dim=[dim])
expected = Dataset(
{"x": (("baz", "z"), [[0, 1]]), "y": (("baz", "z"), [[1, 2]])},
{"baz": [100]},
)
assert_identical(expected, actual)
def test_concat_multiple_dims(self):
objs = [
[Dataset({"a": (("x", "y"), [[0]])}), Dataset({"a": (("x", "y"), [[1]])})],
[Dataset({"a": (("x", "y"), [[2]])}), Dataset({"a": (("x", "y"), [[3]])})],
]
actual = combine_nested(objs, concat_dim=["x", "y"])
expected = Dataset({"a": (("x", "y"), [[0, 1], [2, 3]])})
assert_identical(expected, actual)
def test_concat_name_symmetry(self):
"""Inspired by the discussion on GH issue #2777"""
da1 = DataArray(name="a", data=[[0]], dims=["x", "y"])
da2 = DataArray(name="b", data=[[1]], dims=["x", "y"])
da3 = DataArray(name="a", data=[[2]], dims=["x", "y"])
da4 = DataArray(name="b", data=[[3]], dims=["x", "y"])
x_first = combine_nested([[da1, da2], [da3, da4]], concat_dim=["x", "y"])
y_first = combine_nested([[da1, da3], [da2, da4]], concat_dim=["y", "x"])
assert_identical(x_first, y_first)
def test_concat_one_dim_merge_another(self):
data = create_test_data(add_attrs=False)
data1 = data.copy(deep=True)
data2 = data.copy(deep=True)
objs = [
[data1.var1.isel(dim2=slice(4)), data2.var1.isel(dim2=slice(4, 9))],
[data1.var2.isel(dim2=slice(4)), data2.var2.isel(dim2=slice(4, 9))],
]
expected = data[["var1", "var2"]]
actual = combine_nested(objs, concat_dim=[None, "dim2"])
assert_identical(expected, actual)
def test_auto_combine_2d(self):
ds = create_test_data
partway1 = concat([ds(0), ds(3)], dim="dim1")
partway2 = concat([ds(1), ds(4)], dim="dim1")
partway3 = concat([ds(2), ds(5)], dim="dim1")
expected = concat([partway1, partway2, partway3], dim="dim2")
datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]]
result = combine_nested(datasets, concat_dim=["dim1", "dim2"])
assert_equal(result, expected)
def test_auto_combine_2d_combine_attrs_kwarg(self):
ds = lambda x: create_test_data(x, add_attrs=False)
partway1 = concat([ds(0), ds(3)], dim="dim1")
partway2 = concat([ds(1), ds(4)], dim="dim1")
partway3 = concat([ds(2), ds(5)], dim="dim1")
expected = concat([partway1, partway2, partway3], dim="dim2")
expected_dict = {}
expected_dict["drop"] = expected.copy(deep=True)
expected_dict["drop"].attrs = {}
expected_dict["no_conflicts"] = expected.copy(deep=True)
expected_dict["no_conflicts"].attrs = {
"a": 1,
"b": 2,
"c": 3,
"d": 4,
"e": 5,
"f": 6,
}
expected_dict["override"] = expected.copy(deep=True)
expected_dict["override"].attrs = {"a": 1}
f = lambda attrs, context: attrs[0]
expected_dict[f] = expected.copy(deep=True)
expected_dict[f].attrs = f([{"a": 1}], None)
datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]]
datasets[0][0].attrs = {"a": 1}
datasets[0][1].attrs = {"a": 1, "b": 2}
datasets[0][2].attrs = {"a": 1, "c": 3}
datasets[1][0].attrs = {"a": 1, "d": 4}
datasets[1][1].attrs = {"a": 1, "e": 5}
datasets[1][2].attrs = {"a": 1, "f": 6}
with pytest.raises(ValueError, match=r"combine_attrs='identical'"):
result = combine_nested(
datasets, concat_dim=["dim1", "dim2"], combine_attrs="identical"
)
for combine_attrs in expected_dict:
result = combine_nested(
datasets, concat_dim=["dim1", "dim2"], combine_attrs=combine_attrs
)
assert_identical(result, expected_dict[combine_attrs])
def test_combine_nested_missing_data_new_dim(self):
# Your data includes "time" and "station" dimensions, and each year's
# data has a different set of stations.
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
]
expected = Dataset(
{"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]}
)
actual = combine_nested(datasets, concat_dim="t")
assert_identical(expected, actual)
def test_invalid_hypercube_input(self):
ds = create_test_data
datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]]
with pytest.raises(
ValueError, match=r"sub-lists do not have consistent lengths"
):
combine_nested(datasets, concat_dim=["dim1", "dim2"])
datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]]
with pytest.raises(
ValueError, match=r"sub-lists do not have consistent depths"
):
combine_nested(datasets, concat_dim=["dim1", "dim2"])
datasets = [[ds(0), ds(1)], [ds(3), ds(4)]]
with pytest.raises(ValueError, match=r"concat_dims has length"):
combine_nested(datasets, concat_dim=["dim1"])
def test_merge_one_dim_concat_another(self):
objs = [
[Dataset({"foo": ("x", [0, 1])}), Dataset({"bar": ("x", [10, 20])})],
[Dataset({"foo": ("x", [2, 3])}), Dataset({"bar": ("x", [30, 40])})],
]
expected = Dataset({"foo": ("x", [0, 1, 2, 3]), "bar": ("x", [10, 20, 30, 40])})
actual = combine_nested(objs, concat_dim=["x", None], compat="equals")
assert_identical(expected, actual)
# Proving it works symmetrically
objs = [
[Dataset({"foo": ("x", [0, 1])}), Dataset({"foo": ("x", [2, 3])})],
[Dataset({"bar": ("x", [10, 20])}), Dataset({"bar": ("x", [30, 40])})],
]
actual = combine_nested(objs, concat_dim=[None, "x"], compat="equals")
assert_identical(expected, actual)
def test_combine_concat_over_redundant_nesting(self):
objs = [[Dataset({"x": [0]}), Dataset({"x": [1]})]]
actual = combine_nested(objs, concat_dim=[None, "x"])
expected = Dataset({"x": [0, 1]})
assert_identical(expected, actual)
objs = [[Dataset({"x": [0]})], [Dataset({"x": [1]})]]
actual = combine_nested(objs, concat_dim=["x", None])
expected = Dataset({"x": [0, 1]})
assert_identical(expected, actual)
objs = [[Dataset({"x": [0]})]]
actual = combine_nested(objs, concat_dim=[None, None])
expected = Dataset({"x": [0]})
assert_identical(expected, actual)
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
def test_combine_nested_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
]
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value_a = fill_value_b = np.nan
elif isinstance(fill_value, dict):
fill_value_a = fill_value["a"]
fill_value_b = fill_value["b"]
else:
fill_value_a = fill_value_b = fill_value
expected = Dataset(
{
"a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
"b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
},
{"x": [0, 1, 2]},
)
actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
assert_identical(expected, actual)
def test_combine_nested_unnamed_data_arrays(self):
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
actual = combine_nested([unnamed_array], concat_dim="x")
expected = unnamed_array
assert_identical(expected, actual)
unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
expected = DataArray(
data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
)
assert_identical(expected, actual)
da1 = DataArray(data=[[0.0]], coords={"x": [0], "y": [0]}, dims=["x", "y"])
da2 = DataArray(data=[[1.0]], coords={"x": [0], "y": [1]}, dims=["x", "y"])
da3 = DataArray(data=[[2.0]], coords={"x": [1], "y": [0]}, dims=["x", "y"])
da4 = DataArray(data=[[3.0]], coords={"x": [1], "y": [1]}, dims=["x", "y"])
objs = [[da1, da2], [da3, da4]]
expected = DataArray(
data=[[0.0, 1.0], [2.0, 3.0]],
coords={"x": [0, 1], "y": [0, 1]},
dims=["x", "y"],
)
actual = combine_nested(objs, concat_dim=["x", "y"])
assert_identical(expected, actual)
# TODO aijams - Determine if this test is appropriate.
def test_nested_combine_mixed_datasets_arrays(self):
objs = [
DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
Dataset({"x": [2, 3]}),
]
with pytest.raises(
ValueError, match=r"Can't combine datasets with unnamed arrays."
):
combine_nested(objs, "x")
class TestCombineDatasetsbyCoords:
def test_combine_by_coords(self):
objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
actual = combine_by_coords(objs)
expected = Dataset({"x": [0, 1]})
assert_identical(expected, actual)
actual = combine_by_coords([actual])
assert_identical(expected, actual)
objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2]})]
actual = combine_by_coords(objs)
expected = Dataset({"x": [0, 1, 2]})
assert_identical(expected, actual)
# ensure auto_combine handles non-sorted variables
objs = [
Dataset({"x": ("a", [0]), "y": ("a", [0]), "a": [0]}),
Dataset({"x": ("a", [1]), "y": ("a", [1]), "a": [1]}),
]
actual = combine_by_coords(objs)
expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1]), "a": [0, 1]})
assert_identical(expected, actual)
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})]
actual = combine_by_coords(objs)
expected = Dataset({"x": [0, 1], "y": [0, 1]})
assert_equal(actual, expected)
objs = [Dataset({"x": 0}), Dataset({"x": 1})]
with pytest.raises(
ValueError, match=r"Could not find any dimension coordinates"
):
combine_by_coords(objs)
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
with pytest.raises(
ValueError,
match=r"Every dimension requires a corresponding 1D coordinate and index",
):
combine_by_coords(objs)
def test_empty_input(self):
assert_identical(Dataset(), combine_by_coords([]))
@pytest.mark.parametrize(
"join, expected",
[
("outer", Dataset({"x": [0, 1], "y": [0, 1]})),
("inner", Dataset({"x": [0, 1], "y": []})),
("left", Dataset({"x": [0, 1], "y": [0]})),
("right", Dataset({"x": [0, 1], "y": [1]})),
],
)
def test_combine_coords_join(self, join, expected):
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})]
actual = combine_nested(objs, concat_dim="x", join=join)
assert_identical(expected, actual)
def test_combine_coords_join_exact(self):
objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [1], "y": [1]})]
with pytest.raises(ValueError, match=r"cannot align.*join.*exact.*"):
combine_nested(objs, concat_dim="x", join="exact")
@pytest.mark.parametrize(
"combine_attrs, expected",
[
("drop", Dataset({"x": [0, 1], "y": [0, 1]}, attrs={})),
(
"no_conflicts",
Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "b": 2}),
),
("override", Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1})),
(
lambda attrs, context: attrs[1],
Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "b": 2}),
),
],
)
def test_combine_coords_combine_attrs(self, combine_attrs, expected):
objs = [
Dataset({"x": [0], "y": [0]}, attrs={"a": 1}),
Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 2}),
]
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs=combine_attrs
)
assert_identical(expected, actual)
if combine_attrs == "no_conflicts":
objs[1].attrs["a"] = 2
with pytest.raises(ValueError, match=r"combine_attrs='no_conflicts'"):
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs=combine_attrs
)
def test_combine_coords_combine_attrs_identical(self):
objs = [
Dataset({"x": [0], "y": [0]}, attrs={"a": 1}),
Dataset({"x": [1], "y": [1]}, attrs={"a": 1}),
]
expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1})
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs="identical"
)
assert_identical(expected, actual)
objs[1].attrs["b"] = 2
with pytest.raises(ValueError, match=r"combine_attrs='identical'"):
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs="identical"
)
def test_combine_nested_combine_attrs_drop_conflicts(self):
objs = [
Dataset({"x": [0], "y": [0]}, attrs={"a": 1, "b": 2, "c": 3}),
Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 0, "d": 3}),
]
expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "c": 3, "d": 3})
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts"
)
assert_identical(expected, actual)
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
],
)
def test_combine_nested_combine_attrs_variables(
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
data1 = Dataset(
{
"a": ("x", [1, 2], attrs1),
"b": ("x", [3, -1], attrs1),
"x": ("x", [0, 1], attrs1),
}
)
data2 = Dataset(
{
"a": ("x", [2, 3], attrs2),
"b": ("x", [-2, 1], attrs2),
"x": ("x", [2, 3], attrs2),
}
)
if expect_exception:
with pytest.raises(MergeError, match="combine_attrs"):
combine_by_coords([data1, data2], combine_attrs=combine_attrs)
else:
actual = combine_by_coords([data1, data2], combine_attrs=combine_attrs)
expected = Dataset(
{
"a": ("x", [1, 2, 2, 3], expected_attrs),
"b": ("x", [3, -1, -2, 1], expected_attrs),
},
{"x": ("x", [0, 1, 2, 3], expected_attrs)},
)
assert_identical(actual, expected)
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
],
)
def test_combine_by_coords_combine_attrs_variables(
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
data1 = Dataset(
{"x": ("a", [0], attrs1), "y": ("a", [0], attrs1), "a": ("a", [0], attrs1)}
)
data2 = Dataset(
{"x": ("a", [1], attrs2), "y": ("a", [1], attrs2), "a": ("a", [1], attrs2)}
)
if expect_exception:
with pytest.raises(MergeError, match="combine_attrs"):
combine_by_coords([data1, data2], combine_attrs=combine_attrs)
else:
actual = combine_by_coords([data1, data2], combine_attrs=combine_attrs)
expected = Dataset(
{
"x": ("a", [0, 1], expected_attrs),
"y": ("a", [0, 1], expected_attrs),
"a": ("a", [0, 1], expected_attrs),
}
)
assert_identical(actual, expected)
def test_infer_order_from_coords(self):
data = create_test_data()
objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
actual = combine_by_coords(objs)
expected = data
assert expected.broadcast_equals(actual)
def test_combine_leaving_bystander_dimensions(self):
# Check non-monotonic bystander dimension coord doesn't raise
# ValueError on combine (https://github.com/pydata/xarray/issues/3150)
ycoord = ["a", "c", "b"]
data = np.random.rand(7, 3)
ds1 = Dataset(
data_vars=dict(data=(["x", "y"], data[:3, :])),
coords=dict(x=[1, 2, 3], y=ycoord),
)
ds2 = Dataset(
data_vars=dict(data=(["x", "y"], data[3:, :])),
coords=dict(x=[4, 5, 6, 7], y=ycoord),
)
expected = Dataset(
data_vars=dict(data=(["x", "y"], data)),
coords=dict(x=[1, 2, 3, 4, 5, 6, 7], y=ycoord),
)
actual = combine_by_coords((ds1, ds2))
assert_identical(expected, actual)
def test_combine_by_coords_previously_failed(self):
# In the above scenario, one file is missing, containing the data for
# one year's data for one variable.
datasets = [
Dataset({"a": ("x", [0]), "x": [0]}),
Dataset({"b": ("x", [0]), "x": [0]}),
Dataset({"a": ("x", [1]), "x": [1]}),
]
expected = Dataset({"a": ("x", [0, 1]), "b": ("x", [0, np.nan])}, {"x": [0, 1]})
actual = combine_by_coords(datasets)
assert_identical(expected, actual)
def test_combine_by_coords_still_fails(self):
# concat can't handle new variables (yet):
# https://github.com/pydata/xarray/issues/508
datasets = [Dataset({"x": 0}, {"y": 0}), Dataset({"x": 1}, {"y": 1, "z": 1})]
with pytest.raises(ValueError):
combine_by_coords(datasets, "y")
def test_combine_by_coords_no_concat(self):
objs = [Dataset({"x": 0}), Dataset({"y": 1})]
actual = combine_by_coords(objs)
expected = Dataset({"x": 0, "y": 1})
assert_identical(expected, actual)
objs = [Dataset({"x": 0, "y": 1}), Dataset({"y": np.nan, "z": 2})]
actual = combine_by_coords(objs)
expected = Dataset({"x": 0, "y": 1, "z": 2})
assert_identical(expected, actual)
def test_check_for_impossible_ordering(self):
ds0 = Dataset({"x": [0, 1, 5]})
ds1 = Dataset({"x": [2, 3]})
with pytest.raises(
ValueError,
match=r"does not have monotonic global indexes along dimension x",
):
combine_by_coords([ds1, ds0])
def test_combine_by_coords_incomplete_hypercube(self):
# test that this succeeds with default fill_value
x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
actual = combine_by_coords([x1, x2, x3])
expected = Dataset(
{"a": (("y", "x"), [[1, 1], [1, np.nan]])},
coords={"y": [0, 1], "x": [0, 1]},
)
assert_identical(expected, actual)
# test that this fails if fill_value is None
with pytest.raises(ValueError):
combine_by_coords([x1, x2, x3], fill_value=None)
def test_combine_by_coords_override_order(self) -> None:
# regression test for https://github.com/pydata/xarray/issues/8828
x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
x2 = Dataset(
{"a": (("y", "x"), [[2]]), "b": (("y", "x"), [[1]])},
coords={"y": [0], "x": [0]},
)
actual = combine_by_coords([x1, x2], compat="override")
assert_equal(actual["a"], actual["b"])
assert_equal(actual["a"], x1["a"])
actual = combine_by_coords([x2, x1], compat="override")
assert_equal(actual["a"], x2["a"])
class TestCombineMixedObjectsbyCoords:
def test_combine_by_coords_mixed_unnamed_dataarrays(self):
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
with pytest.raises(
ValueError, match="Can't automatically combine unnamed DataArrays with"
):
combine_by_coords([named_da, unnamed_da])
da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]}))
ds = Dataset({"x": [2, 3]})
with pytest.raises(
ValueError,
match="Can't automatically combine unnamed DataArrays with",
):
combine_by_coords([da, ds])
def test_combine_coords_mixed_datasets_named_dataarrays(self):
da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]}))
ds = Dataset({"b": ("x", [2, 3])})
actual = combine_by_coords([da, ds])
expected = Dataset(
{"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])}
)
assert_identical(expected, actual)
def test_combine_by_coords_all_unnamed_dataarrays(self):
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
actual = combine_by_coords([unnamed_array])
expected = unnamed_array
assert_identical(expected, actual)
unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
actual = combine_by_coords([unnamed_array1, unnamed_array2])
expected = DataArray(
data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
)
assert_identical(expected, actual)
def test_combine_by_coords_all_named_dataarrays(self):
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
actual = combine_by_coords([named_da])
expected = named_da.to_dataset()
assert_identical(expected, actual)
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
actual = combine_by_coords([named_da1, named_da2])
expected = Dataset(
{
"a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
"b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"),
}
)
assert_identical(expected, actual)
def test_combine_by_coords_all_dataarrays_with_the_same_name(self):
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
actual = combine_by_coords([named_da1, named_da2])
expected = merge([named_da1, named_da2])
assert_identical(expected, actual)
@requires_cftime
def test_combine_by_coords_distant_cftime_dates():
# Regression test for https://github.com/pydata/xarray/issues/3535
import cftime
time_1 = [cftime.DatetimeGregorian(4500, 12, 31)]
time_2 = [cftime.DatetimeGregorian(4600, 12, 31)]
time_3 = [cftime.DatetimeGregorian(5100, 12, 31)]
da_1 = DataArray([0], dims=["time"], coords=[time_1], name="a").to_dataset()
da_2 = DataArray([1], dims=["time"], coords=[time_2], name="a").to_dataset()
da_3 = DataArray([2], dims=["time"], coords=[time_3], name="a").to_dataset()
result = combine_by_coords([da_1, da_2, da_3])
expected_time = np.concatenate([time_1, time_2, time_3])
expected = DataArray(
[0, 1, 2], dims=["time"], coords=[expected_time], name="a"
).to_dataset()
assert_identical(result, expected)
@requires_cftime
def test_combine_by_coords_raises_for_differing_calendars():
# previously failed with uninformative StopIteration instead of TypeError
# https://github.com/pydata/xarray/issues/4495
import cftime
time_1 = [cftime.DatetimeGregorian(2000, 1, 1)]
time_2 = [cftime.DatetimeProlepticGregorian(2001, 1, 1)]
da_1 = DataArray([0], dims=["time"], coords=[time_1], name="a").to_dataset()
da_2 = DataArray([1], dims=["time"], coords=[time_2], name="a").to_dataset()
error_msg = (
"Cannot combine along dimension 'time' with mixed types."
" Found:.*"
" If importing data directly from a file then setting"
" `use_cftime=True` may fix this issue."
)
with pytest.raises(TypeError, match=error_msg):
combine_by_coords([da_1, da_2])
def test_combine_by_coords_raises_for_differing_types():
# str and byte cannot be compared
da_1 = DataArray([0], dims=["time"], coords=[["a"]], name="a").to_dataset()
da_2 = DataArray([1], dims=["time"], coords=[[b"b"]], name="a").to_dataset()
with pytest.raises(
TypeError, match=r"Cannot combine along dimension 'time' with mixed types."
):
combine_by_coords([da_1, da_2])