from __future__ import annotations

import datetime
import operator
import warnings
from itertools import pairwise
from typing import Literal
from unittest import mock

import numpy as np
import pandas as pd
import pytest
from packaging.version import Version

import xarray as xr
from xarray import DataArray, Dataset, Variable
from xarray.core.alignment import broadcast
from xarray.core.groupby import _consolidate_slices
from xarray.core.types import InterpOptions, ResampleCompatible
from xarray.groupers import (
    BinGrouper,
    EncodedGroups,
    Grouper,
    TimeResampler,
    UniqueGrouper,
)
from xarray.namedarray.pycompat import is_chunked_array
from xarray.tests import (
    InaccessibleArray,
    assert_allclose,
    assert_equal,
    assert_identical,
    create_test_data,
    has_cftime,
    has_dask,
    has_dask_ge_2024_08_1,
    has_flox,
    has_pandas_ge_2_2,
    raise_if_dask_computes,
    requires_cftime,
    requires_dask,
    requires_dask_ge_2024_08_1,
    requires_flox,
    requires_flox_0_9_12,
    requires_pandas_ge_2_2,
    requires_scipy,
)


@pytest.fixture
def dataset() -> xr.Dataset:
    ds = xr.Dataset(
        {
            "foo": (("x", "y", "z"), np.random.randn(3, 4, 2)),
            "baz": ("x", ["e", "f", "g"]),
            "cat": ("y", pd.Categorical(["cat1", "cat2", "cat2", "cat1"])),
        },
        {"x": ("x", ["a", "b", "c"], {"name": "x"}), "y": [1, 2, 3, 4], "z": [1, 2]},
    )
    ds["boo"] = (("z", "y"), [["f", "g", "h", "j"]] * 2)

    return ds


@pytest.fixture
def array(dataset) -> xr.DataArray:
    return dataset["foo"]


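# As the assertions below demonstrate, the private helper _consolidate_slices
# merges adjacent slices and leaves non-adjacent ones untouched.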
def test_consolidate_slices() -> None:
    assert _consolidate_slices([slice(3), slice(3, 5)]) == [slice(5)]
    assert _consolidate_slices([slice(2, 3), slice(3, 6)]) == [slice(2, 6)]
    assert _consolidate_slices([slice(2, 3, 1), slice(3, 6, 1)]) == [slice(2, 6, 1)]

    slices = [slice(2, 3), slice(5, 6)]
    assert _consolidate_slices(slices) == slices

    # ignore type because we're checking for an error anyway
    with pytest.raises(ValueError):
        _consolidate_slices([slice(3), 4])  # type: ignore[list-item]


@pytest.mark.filterwarnings("ignore:return type")
def test_groupby_dims_property(dataset) -> None:
    with pytest.warns(FutureWarning, match="The return type of"):
        assert dataset.groupby("x").dims == dataset.isel(x=[1]).dims
    with pytest.warns(FutureWarning, match="The return type of"):
        assert dataset.groupby("y").dims == dataset.isel(y=[1]).dims

    assert tuple(dataset.groupby("x").dims) == tuple(dataset.isel(x=slice(1, 2)).dims)
    assert tuple(dataset.groupby("y").dims) == tuple(dataset.isel(y=slice(1, 2)).dims)

    dataset = dataset.drop_vars(["cat"])
    stacked = dataset.stack({"xy": ("x", "y")})
    assert tuple(stacked.groupby("xy").dims) == tuple(stacked.isel(xy=[0]).dims)


def test_groupby_sizes_property(dataset) -> None:
    assert dataset.groupby("x").sizes == dataset.isel(x=[1]).sizes
    assert dataset.groupby("y").sizes == dataset.isel(y=[1]).sizes
    dataset = dataset.drop_vars("cat")
    stacked = dataset.stack({"xy": ("x", "y")})
    assert stacked.groupby("xy").sizes == stacked.isel(xy=[0]).sizes


def test_multi_index_groupby_map(dataset) -> None:
    # regression test for GH873
    ds = dataset.isel(z=1, drop=True)[["foo"]]
    expected = 2 * ds
    actual = (
        ds.stack(space=["x", "y"])
        .groupby("space")
        .map(lambda x: 2 * x)
        .unstack("space")
    )
    assert_equal(expected, actual)


@pytest.mark.parametrize("grouper", [dict(group="x"), dict(x=UniqueGrouper())])
def test_reduce_numeric_only(dataset, grouper: dict) -> None:
    gb = dataset.groupby(**grouper)
    with xr.set_options(use_flox=False):
        expected = gb.sum()
    with xr.set_options(use_flox=True):
        actual = gb.sum()
    assert_identical(expected, actual)


def test_multi_index_groupby_sum() -> None:
    # regression test for GH873
    ds = xr.Dataset(
        {"foo": (("x", "y", "z"), np.ones((3, 4, 2)))},
        {"x": ["a", "b", "c"], "y": [1, 2, 3, 4]},
    )
    expected = ds.sum("z")
    actual = ds.stack(space=["x", "y"]).groupby("space").sum("z").unstack("space")
    assert_equal(expected, actual)

    with pytest.raises(NotImplementedError):
        actual = (
            ds.stack(space=["x", "y"])
            .groupby(space=UniqueGrouper(), z=UniqueGrouper())
            .sum("z")
            .unstack("space")
        )
        assert_equal(expected, ds)

    if not has_pandas_ge_2_2:
        # the next line triggers a mysterious multiindex error on pandas 2.0
        return

    actual = ds.stack(space=["x", "y"]).groupby("space").sum(...).unstack("space")
    assert_equal(expected, actual)


@requires_pandas_ge_2_2
def test_multi_index_propagation():
    # regression test for GH9648
    times = pd.date_range("2023-01-01", periods=4)
    locations = ["A", "B"]
    data = [[0.5, 0.7], [0.6, 0.5], [0.4, 0.6], [0.4, 0.9]]

    da = xr.DataArray(
        data, dims=["time", "location"], coords={"time": times, "location": locations}
    )
    da = da.stack(multiindex=["time", "location"])
    grouped = da.groupby("multiindex")

    with xr.set_options(use_flox=True):
        actual = grouped.sum()
    with xr.set_options(use_flox=False):
        expected = grouped.first()
    assert_identical(actual, expected)


def test_groupby_da_datetime() -> None:
    # test groupby with a DataArray of dtype datetime for GH1132
    # create test data
    times = pd.date_range("2000-01-01", periods=4)
    foo = xr.DataArray([1, 2, 3, 4], coords=dict(time=times), dims="time")
    # create test index
    reference_dates = [times[0], times[2]]
    labels = reference_dates[0:1] * 2 + reference_dates[1:2] * 2
    ind = xr.DataArray(
        labels, coords=dict(time=times), dims="time", name="reference_date"
    )
    g = foo.groupby(ind)
    actual = g.sum(dim="time")
    expected = xr.DataArray(
        [3, 7], coords=dict(reference_date=reference_dates), dims="reference_date"
    )
    assert_equal(expected, actual)


def test_groupby_duplicate_coordinate_labels() -> None:
    # fix for https://stackoverflow.com/questions/38065129
    array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])])
    expected = xr.DataArray([3, 3], [("x", [1, 2])])
    actual = array.groupby("x").sum()
    assert_equal(expected, actual)


def test_groupby_input_mutation() -> None:
    # regression test for GH2153
    array = xr.DataArray([1, 2, 3], [("x", [2, 2, 1])])
    array_copy = array.copy()
    expected = xr.DataArray([3, 3], [("x", [1, 2])])
    actual = array.groupby("x").sum()
    assert_identical(expected, actual)
    assert_identical(array, array_copy)  # should not modify inputs


@pytest.mark.parametrize("use_flox", [True, False])
def test_groupby_indexvariable(use_flox: bool) -> None:
    # regression test for GH7919
    array = xr.DataArray([1, 2, 3], [("x", [2, 2, 1])])
    iv = xr.IndexVariable(dims="x", data=pd.Index(array.x.values))
    with xr.set_options(use_flox=use_flox):
        actual = array.groupby(iv).sum()
    expected = xr.DataArray([3, 3], [("x", [1, 2])])
    assert_identical(expected, actual)


@pytest.mark.parametrize(
    "obj",
    [
        xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])]),
        xr.Dataset({"foo": ("x", [1, 2, 3, 4, 5, 6])}, {"x": [1, 1, 1, 2, 2, 2]}),
    ],
)
def test_groupby_map_shrink_groups(obj) -> None:
    expected = obj.isel(x=[0, 1, 3, 4])
    actual = obj.groupby("x").map(lambda f: f.isel(x=[0, 1]))
    assert_identical(expected, actual)


@pytest.mark.parametrize(
    "obj",
    [
        xr.DataArray([1, 2, 3], [("x", [1, 2, 2])]),
        xr.Dataset({"foo": ("x", [1, 2, 3])}, {"x": [1, 2, 2]}),
    ],
)
def test_groupby_map_change_group_size(obj) -> None:
    def func(group):
        if group.sizes["x"] == 1:
            result = group.isel(x=[0, 0])
        else:
            result = group.isel(x=[0])
        return result

    expected = obj.isel(x=[0, 0, 1])
    actual = obj.groupby("x").map(func)
    assert_identical(expected, actual)


def test_da_groupby_map_func_args() -> None:
    def func(arg1, arg2, arg3=0):
        return arg1 + arg2 + arg3

    array = xr.DataArray([1, 1, 1], [("x", [1, 2, 3])])
    expected = xr.DataArray([3, 3, 3], [("x", [1, 2, 3])])
    actual = array.groupby("x").map(func, args=(1,), arg3=1)
    assert_identical(expected, actual)


def test_ds_groupby_map_func_args() -> None:
    def func(arg1, arg2, arg3=0):
        return arg1 + arg2 + arg3

    dataset = xr.Dataset({"foo": ("x", [1, 1, 1])}, {"x": [1, 2, 3]})
    expected = xr.Dataset({"foo": ("x", [3, 3, 3])}, {"x": [1, 2, 3]})
    actual = dataset.groupby("x").map(func, args=(1,), arg3=1)
    assert_identical(expected, actual)


def test_da_groupby_empty() -> None:
    empty_array = xr.DataArray([], dims="dim")

    with pytest.raises(ValueError):
        empty_array.groupby("dim")


@requires_dask
def test_dask_da_groupby_quantile() -> None:
    # Scalar quantile
    expected = xr.DataArray(
        data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
    )
    array = xr.DataArray(
        data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
    )

    # will work blockwise with flox
    actual = array.chunk(x=3).groupby("x").quantile(0.5)
    assert_identical(expected, actual)

    # will work blockwise with flox
    actual = array.chunk(x=-1).groupby("x").quantile(0.5)
    assert_identical(expected, actual)


@requires_dask
def test_dask_da_groupby_median() -> None:
    expected = xr.DataArray(data=[2, 5], coords={"x": [1, 2]}, dims="x")
    array = xr.DataArray(
        data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
    )
    with xr.set_options(use_flox=False):
        actual = array.chunk(x=1).groupby("x").median()
    assert_identical(expected, actual)

    with xr.set_options(use_flox=True):
        actual = array.chunk(x=1).groupby("x").median()
    assert_identical(expected, actual)

    # will work blockwise with flox
    actual = array.chunk(x=3).groupby("x").median()
    assert_identical(expected, actual)

    # will work blockwise with flox
    actual = array.chunk(x=-1).groupby("x").median()
    assert_identical(expected, actual)


@pytest.mark.parametrize("use_flox", [pytest.param(True, marks=requires_flox), False])
def test_da_groupby_quantile(use_flox: bool) -> None:
    array = xr.DataArray(
        data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
    )

    # Scalar quantile
    expected = xr.DataArray(
        data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
    )

    with xr.set_options(use_flox=use_flox):
        actual = array.groupby("x").quantile(0.5)
    assert_identical(expected, actual)

    # Vector quantile
    expected = xr.DataArray(
        data=[[1, 3], [4, 6]],
        coords={"x": [1, 2], "quantile": [0, 1]},
        dims=("x", "quantile"),
    )
    with xr.set_options(use_flox=use_flox):
        actual = array.groupby("x").quantile([0, 1])
    assert_identical(expected, actual)

    array = xr.DataArray(
        data=[np.nan, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
    )

    for skipna in (True, False, None):
        e = [np.nan, 5] if skipna is False else [2.5, 5]

        expected = xr.DataArray(data=e, coords={"x": [1, 2], "quantile": 0.5}, dims="x")
        with xr.set_options(use_flox=use_flox):
            actual = array.groupby("x").quantile(0.5, skipna=skipna)
        assert_identical(expected, actual)

    # Multiple dimensions
    array = xr.DataArray(
        data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
        dims=("x", "y"),
    )

    actual_x = array.groupby("x").quantile(0, dim=...)
    expected_x = xr.DataArray(
        data=[1, 4], coords={"x": [1, 2], "quantile": 0}, dims="x"
    )
    assert_identical(expected_x, actual_x)

    actual_y = array.groupby("y").quantile(0, dim=...)
    expected_y = xr.DataArray(
        data=[1, 22], coords={"y": [0, 1], "quantile": 0}, dims="y"
    )
    assert_identical(expected_y, actual_y)

    actual_xx = array.groupby("x").quantile(0)
    expected_xx = xr.DataArray(
        data=[[1, 11, 22], [4, 15, 24]],
        coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
        dims=("x", "y"),
    )
    assert_identical(expected_xx, actual_xx)

    actual_yy = array.groupby("y").quantile(0)
    expected_yy = xr.DataArray(
        data=[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
        dims=("x", "y"),
    )
    assert_identical(expected_yy, actual_yy)

    times = pd.date_range("2000-01-01", periods=365)
    x = [0, 1]
    foo = xr.DataArray(
        np.reshape(np.arange(365 * 2), (365, 2)),
        coords={"time": times, "x": x},
        dims=("time", "x"),
    )
    g = foo.groupby(foo.time.dt.month)

    actual = g.quantile(0, dim=...)
    expected = xr.DataArray(
        data=[
            0.0,
            62.0,
            120.0,
            182.0,
            242.0,
            304.0,
            364.0,
            426.0,
            488.0,
            548.0,
            610.0,
            670.0,
        ],
        coords={"month": np.arange(1, 13), "quantile": 0},
        dims="month",
    )
    assert_identical(expected, actual)

    actual = g.quantile(0, dim="time")[:2]
    expected = xr.DataArray(
        data=[[0.0, 1], [62.0, 63]],
        coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
        dims=("month", "x"),
    )
    assert_identical(expected, actual)

    # method keyword
    array = xr.DataArray(data=[1, 2, 3, 4], coords={"x": [1, 1, 2, 2]}, dims="x")

    expected = xr.DataArray(
        data=[1, 3], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
    )
    actual = array.groupby("x").quantile(0.5, method="lower")
    assert_identical(expected, actual)


def test_ds_groupby_quantile() -> None:
    ds = xr.Dataset(
        data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])}, coords={"x": [1, 1, 1, 2, 2, 2]}
    )

    # Scalar quantile
    expected = xr.Dataset(
        data_vars={"a": ("x", [2, 5])}, coords={"quantile": 0.5, "x": [1, 2]}
    )
    actual = ds.groupby("x").quantile(0.5)
    assert_identical(expected, actual)

    # Vector quantile
    expected = xr.Dataset(
        data_vars={"a": (("x", "quantile"), [[1, 3], [4, 6]])},
        coords={"x": [1, 2], "quantile": [0, 1]},
    )
    actual = ds.groupby("x").quantile([0, 1])
    assert_identical(expected, actual)

    ds = xr.Dataset(
        data_vars={"a": ("x", [np.nan, 2, 3, 4, 5, 6])},
        coords={"x": [1, 1, 1, 2, 2, 2]},
    )

    for skipna in (True, False, None):
        e = [np.nan, 5] if skipna is False else [2.5, 5]

        expected = xr.Dataset(
            data_vars={"a": ("x", e)}, coords={"quantile": 0.5, "x": [1, 2]}
        )
        actual = ds.groupby("x").quantile(0.5, skipna=skipna)
        assert_identical(expected, actual)

    # Multiple dimensions
    ds = xr.Dataset(
        data_vars={
            "a": (
                ("x", "y"),
                [[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
            )
        },
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
    )

    actual_x = ds.groupby("x").quantile(0, dim=...)
    expected_x = xr.Dataset({"a": ("x", [1, 4])}, coords={"x": [1, 2], "quantile": 0})
    assert_identical(expected_x, actual_x)

    actual_y = ds.groupby("y").quantile(0, dim=...)
    expected_y = xr.Dataset({"a": ("y", [1, 22])}, coords={"y": [0, 1], "quantile": 0})
    assert_identical(expected_y, actual_y)

    actual_xx = ds.groupby("x").quantile(0)
    expected_xx = xr.Dataset(
        {"a": (("x", "y"), [[1, 11, 22], [4, 15, 24]])},
        coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
    )
    assert_identical(expected_xx, actual_xx)

    actual_yy = ds.groupby("y").quantile(0)
    expected_yy = xr.Dataset(
        {"a": (("x", "y"), [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]])},
        coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
    ).transpose()
    assert_identical(expected_yy, actual_yy)

    times = pd.date_range("2000-01-01", periods=365)
    x = [0, 1]
    foo = xr.Dataset(
        {"a": (("time", "x"), np.reshape(np.arange(365 * 2), (365, 2)))},
        coords=dict(time=times, x=x),
    )
    g = foo.groupby(foo.time.dt.month)

    actual = g.quantile(0, dim=...)
    expected = xr.Dataset(
        {
            "a": (
                "month",
                [
                    0.0,
                    62.0,
                    120.0,
                    182.0,
                    242.0,
                    304.0,
                    364.0,
                    426.0,
                    488.0,
                    548.0,
                    610.0,
                    670.0,
                ],
            )
        },
        coords={"month": np.arange(1, 13), "quantile": 0},
    )
    assert_identical(expected, actual)

    actual = g.quantile(0, dim="time").isel(month=slice(None, 2))
    expected = xr.Dataset(
        data_vars={"a": (("month", "x"), [[0.0, 1], [62.0, 63]])},
        coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
    )
    assert_identical(expected, actual)

    ds = xr.Dataset(data_vars={"a": ("x", [1, 2, 3, 4])}, coords={"x": [1, 1, 2, 2]})

    # method keyword
    expected = xr.Dataset(
        data_vars={"a": ("x", [1, 3])}, coords={"quantile": 0.5, "x": [1, 2]}
    )
    actual = ds.groupby("x").quantile(0.5, method="lower")
    assert_identical(expected, actual)


@pytest.mark.parametrize("as_dataset", [False, True])
def test_groupby_quantile_interpolation_deprecated(as_dataset: bool) -> None:
    array = xr.DataArray(data=[1, 2, 3, 4], coords={"x": [1, 1, 2, 2]}, dims="x")

    arr: xr.DataArray | xr.Dataset
    arr = array.to_dataset(name="name") if as_dataset else array

    with pytest.warns(
        FutureWarning,
        match="`interpolation` argument to quantile was renamed to `method`",
    ):
        actual = arr.quantile(0.5, interpolation="lower")

    expected = arr.quantile(0.5, method="lower")

    assert_identical(actual, expected)

    with warnings.catch_warnings(record=True):
        with pytest.raises(TypeError, match="interpolation and method keywords"):
            arr.quantile(0.5, method="lower", interpolation="lower")


def test_da_groupby_assign_coords() -> None:
    actual = xr.DataArray(
        [[3, 4, 5], [6, 7, 8]], dims=["y", "x"], coords={"y": range(2), "x": range(3)}
    )
    actual1 = actual.groupby("x").assign_coords({"y": [-1, -2]})
    actual2 = actual.groupby("x").assign_coords(y=[-1, -2])
    expected = xr.DataArray(
        [[3, 4, 5], [6, 7, 8]], dims=["y", "x"], coords={"y": [-1, -2], "x": range(3)}
    )
    assert_identical(expected, actual1)
    assert_identical(expected, actual2)


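# Module-level example array shared, via parametrize, by the two repr tests below.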
repr_da = xr.DataArray(
    np.random.randn(10, 20, 6, 24),
    dims=["x", "y", "z", "t"],
    coords={
        "z": ["a", "b", "c", "a", "b", "c"],
        "x": [1, 1, 1, 2, 2, 3, 4, 5, 3, 4],
        "t": xr.date_range("2001-01-01", freq="ME", periods=24, use_cftime=False),
        "month": ("t", list(range(1, 13)) * 2),
    },
)


@pytest.mark.parametrize("dim", ["x", "y", "z", "month"])
@pytest.mark.parametrize("obj", [repr_da, repr_da.to_dataset(name="a")])
def test_groupby_repr(obj, dim) -> None:
    actual = repr(obj.groupby(dim))
    N = len(np.unique(obj[dim]))
    expected = f"<{obj.__class__.__name__}GroupBy"
    expected += f", grouped over 1 grouper(s), {N} groups in total:"
    expected += f"\n {dim!r}: {N}/{N} groups present with labels "
    if dim == "x":
        expected += "1, 2, 3, 4, 5>"
    elif dim == "y":
        expected += "0, 1, 2, 3, 4, 5, ..., 15, 16, 17, 18, 19>"
    elif dim == "z":
        expected += "'a', 'b', 'c'>"
    elif dim == "month":
        expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>"
    assert actual == expected


@pytest.mark.parametrize("obj", [repr_da, repr_da.to_dataset(name="a")])
def test_groupby_repr_datetime(obj) -> None:
    actual = repr(obj.groupby("t.month"))
    expected = f"<{obj.__class__.__name__}GroupBy"
    expected += ", grouped over 1 grouper(s), 12 groups in total:\n"
    expected += " 'month': 12/12 groups present with labels "
    expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>"
    assert actual == expected


@pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning")
@pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning")
@pytest.mark.parametrize("shuffle", [True, False])
@pytest.mark.parametrize(
    "chunk",
    [
        pytest.param(
            dict(lat=1), marks=pytest.mark.skipif(not has_dask, reason="no dask")
        ),
        pytest.param(
            dict(lat=2, lon=2), marks=pytest.mark.skipif(not has_dask, reason="no dask")
        ),
        False,
    ],
)
def test_groupby_drops_nans(shuffle: bool, chunk: Literal[False] | dict) -> None:
    if shuffle and chunk and not has_dask_ge_2024_08_1:
        pytest.skip()
    # GH2383
    # nan in 2D data variable (requires stacking)
    ds = xr.Dataset(
        {
            "variable": (("lat", "lon", "time"), np.arange(60.0).reshape((4, 3, 5))),
            "id": (("lat", "lon"), np.arange(12.0).reshape((4, 3))),
        },
        coords={"lat": np.arange(4), "lon": np.arange(3), "time": np.arange(5)},
    )

    ds["id"].values[0, 0] = np.nan
    ds["id"].values[3, 0] = np.nan
    ds["id"].values[-1, -1] = np.nan

    if chunk:
        ds["variable"] = ds["variable"].chunk(chunk)
    grouped = ds.groupby(ds.id)
    if shuffle:
        grouped = grouped.shuffle_to_chunks().groupby(ds.id)

    # non reduction operation
    expected1 = ds.copy()
    expected1.variable.data[0, 0, :] = np.nan
    expected1.variable.data[-1, -1, :] = np.nan
    expected1.variable.data[3, 0, :] = np.nan
    actual1 = grouped.map(lambda x: x).transpose(*ds.variable.dims)
    assert_identical(actual1, expected1)

    # reduction along grouped dimension
    actual2 = grouped.mean()
    stacked = ds.stack({"xy": ["lat", "lon"]})
    expected2 = (
        stacked.variable.where(stacked.id.notnull())
        .rename({"xy": "id"})
        .to_dataset()
        .reset_index("id", drop=True)
        .assign(id=stacked.id.values)
        .dropna("id")
        .transpose(*actual2.variable.dims)
    )
    assert_identical(actual2, expected2)

    # reduction operation along a different dimension
    actual3 = grouped.mean("time")
    expected3 = ds.mean("time").where(ds.id.notnull())
    assert_identical(actual3, expected3)

    # NaN in non-dimensional coordinate
    array = xr.DataArray([1, 2, 3], [("x", [1, 2, 3])])
    array["x1"] = ("x", [1, 1, np.nan])
    expected4 = xr.DataArray(3, [("x1", [1])])
    actual4 = array.groupby("x1").sum()
    assert_equal(expected4, actual4)

    # NaT in non-dimensional coordinate
    array["t"] = (
        "x",
        [
            np.datetime64("2001-01-01"),
            np.datetime64("2001-01-01"),
            np.datetime64("NaT"),
        ],
    )
    expected5 = xr.DataArray(3, [("t", [np.datetime64("2001-01-01")])])
    actual5 = array.groupby("t").sum()
    assert_equal(expected5, actual5)

    # test for repeated coordinate labels
    array = xr.DataArray([0, 1, 2, 4, 3, 4], [("x", [np.nan, 1, 1, np.nan, 2, np.nan])])
    expected6 = xr.DataArray([3, 3], [("x", [1, 2])])
    actual6 = array.groupby("x").sum()
    assert_equal(expected6, actual6)


def test_groupby_grouping_errors() -> None:
    dataset = xr.Dataset({"foo": ("x", [1, 1, 1])}, {"x": [1, 2, 3]})
    with pytest.raises(
        ValueError, match=r"None of the data falls within bins with edges"
    ):
        dataset.groupby_bins("x", bins=[0.1, 0.2, 0.3])

    with pytest.raises(
        ValueError, match=r"None of the data falls within bins with edges"
    ):
        dataset.to_dataarray().groupby_bins("x", bins=[0.1, 0.2, 0.3])

    with pytest.raises(ValueError, match=r"All bin edges are NaN."):
        dataset.groupby_bins("x", bins=[np.nan, np.nan, np.nan])

    with pytest.raises(ValueError, match=r"All bin edges are NaN."):
        dataset.to_dataarray().groupby_bins("x", bins=[np.nan, np.nan, np.nan])

    with pytest.raises(ValueError, match=r"Failed to group data."):
        dataset.groupby(dataset.foo * np.nan)

    with pytest.raises(ValueError, match=r"Failed to group data."):
        dataset.to_dataarray().groupby(dataset.foo * np.nan)


def test_groupby_reduce_dimension_error(array) -> None:
    grouped = array.groupby("y")
    # assert_identical(array, grouped.mean())

    with pytest.raises(ValueError, match=r"cannot reduce over dimensions"):
        grouped.mean("huh")

    with pytest.raises(ValueError, match=r"cannot reduce over dimensions"):
        grouped.mean(("x", "y", "asd"))

    assert_identical(array.mean("x"), grouped.reduce(np.mean, "x"))
    assert_allclose(array.mean(["x", "z"]), grouped.reduce(np.mean, ["x", "z"]))

    grouped = array.groupby("y")
    assert_identical(array, grouped.mean())

    assert_identical(array.mean("x"), grouped.reduce(np.mean, "x"))
    assert_allclose(array.mean(["x", "z"]), grouped.reduce(np.mean, ["x", "z"]))


def test_groupby_multiple_string_args(array) -> None:
    with pytest.raises(TypeError):
        array.groupby("x", squeeze="y")


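# For the next test: 15-minute samples grouped into 24-hour bins give
# 4 * 24 = 96 samples per bin, which is where the expected value comes from.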
def test_groupby_bins_timeseries() -> None:
    ds = xr.Dataset()
    ds["time"] = xr.DataArray(
        pd.date_range("2010-08-01", "2010-08-15", freq="15min"), dims="time"
    )
    ds["val"] = xr.DataArray(np.ones(ds["time"].shape), dims="time")
    time_bins = pd.date_range(start="2010-08-01", end="2010-08-15", freq="24h")
    actual = ds.groupby_bins("time", time_bins).sum()
    expected = xr.DataArray(
        96 * np.ones((14,)),
        dims=["time_bins"],
        coords={"time_bins": pd.cut(time_bins, time_bins).categories},  # type: ignore[arg-type]
    ).to_dataset(name="val")
    assert_identical(actual, expected)


def test_groupby_none_group_name() -> None:
    # GH158
    # xarray should not fail if a DataArray's name attribute is None

    data = np.arange(10) + 10
    da = xr.DataArray(data)  # da.name = None
    key = xr.DataArray(np.floor_divide(data, 2))

    mean = da.groupby(key).mean()
    assert "group" in mean.dims


def test_groupby_getitem(dataset) -> None:
    assert_identical(dataset.sel(x=["a"]), dataset.groupby("x")["a"])
    assert_identical(dataset.sel(z=[1]), dataset.groupby("z")[1])
    assert_identical(dataset.foo.sel(x=["a"]), dataset.foo.groupby("x")["a"])
    assert_identical(dataset.foo.sel(z=[1]), dataset.foo.groupby("z")[1])
    assert_identical(dataset.cat.sel(y=[1]), dataset.cat.groupby("y")[1])

    with pytest.raises(
        NotImplementedError, match="Cannot broadcast 1d-only pandas categorical array."
    ):
        dataset.groupby("boo")
    dataset = dataset.drop_vars(["cat"])
    actual = dataset.groupby("boo")["f"].unstack().transpose("x", "y", "z")
    expected = dataset.sel(y=[1], z=[1, 2]).transpose("x", "y", "z")
    assert_identical(expected, actual)


def test_groupby_dataset() -> None:
    data = Dataset(
        {"z": (["x", "y"], np.random.randn(3, 5))},
        {"x": ("x", list("abc")), "c": ("x", [0, 1, 0]), "y": range(5)},
    )
    groupby = data.groupby("x")
    assert len(groupby) == 3
    expected_groups = {"a": slice(0, 1), "b": slice(1, 2), "c": slice(2, 3)}
    assert groupby.groups == expected_groups
    expected_items = [
        ("a", data.isel(x=[0])),
        ("b", data.isel(x=[1])),
        ("c", data.isel(x=[2])),
    ]
    for actual1, expected1 in zip(groupby, expected_items, strict=True):
        assert actual1[0] == expected1[0]
        assert_equal(actual1[1], expected1[1])

    def identity(x):
        return x

    for k in ["x", "c", "y"]:
        actual2 = data.groupby(k).map(identity)
        assert_equal(data, actual2)


def test_groupby_dataset_returns_new_type() -> None:
    data = Dataset({"z": (["x", "y"], np.random.randn(3, 5))})

    actual1 = data.groupby("x").map(lambda ds: ds["z"])
    expected1 = data["z"]
    assert_identical(expected1, actual1)

    actual2 = data["z"].groupby("x").map(lambda x: x.to_dataset())
    expected2 = data
    assert_identical(expected2, actual2)


def test_groupby_dataset_iter() -> None:
    data = create_test_data()
    for n, (t, sub) in enumerate(list(data.groupby("dim1"))[:3]):
        assert data["dim1"][n] == t
        assert_equal(data["var1"][[n]], sub["var1"])
        assert_equal(data["var2"][[n]], sub["var2"])
        assert_equal(data["var3"][:, [n]], sub["var3"])


def test_groupby_dataset_errors() -> None:
    data = create_test_data()
    with pytest.raises(TypeError, match=r"`group` must be"):
        data.groupby(np.arange(10))  # type: ignore[arg-type,unused-ignore]
    with pytest.raises(ValueError, match=r"length does not match"):
        data.groupby(data["dim1"][:3])
    with pytest.raises(TypeError, match=r"`group` must be"):
        data.groupby(data.coords["dim1"].to_index())  # type: ignore[arg-type]


@pytest.mark.parametrize("use_flox", [True, False])
@pytest.mark.parametrize(
    "by_func",
    [
        pytest.param(lambda x: x, id="group-by-string"),
        pytest.param(lambda x: {x: UniqueGrouper()}, id="group-by-unique-grouper"),
    ],
)
@pytest.mark.parametrize("letters_as_coord", [True, False])
def test_groupby_dataset_reduce_ellipsis(
    by_func, use_flox: bool, letters_as_coord: bool
) -> None:
    data = Dataset(
        {
            "xy": (["x", "y"], np.random.randn(3, 4)),
            "xonly": ("x", np.random.randn(3)),
            "yonly": ("y", np.random.randn(4)),
            "letters": ("y", ["a", "a", "b", "b"]),
        }
    )

    if letters_as_coord:
        data = data.set_coords("letters")

    expected = data.mean("y")
    expected["yonly"] = expected["yonly"].variable.set_dims({"x": 3})
    gb = data.groupby(by_func("x"))
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean(...)
    assert_allclose(expected, actual)

    with xr.set_options(use_flox=use_flox):
        actual = gb.mean("y")
    assert_allclose(expected, actual)

    letters = data["letters"]
    expected = Dataset(
        {
            "xy": data["xy"].groupby(letters).mean(...),
            "xonly": (data["xonly"].mean().variable.set_dims({"letters": 2})),
            "yonly": data["yonly"].groupby(letters).mean(),
        }
    )
    gb = data.groupby(by_func("letters"))
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean(...)
    assert_allclose(expected, actual)


def test_groupby_dataset_math() -> None:
    def reorder_dims(x):
        return x.transpose("dim1", "dim2", "dim3", "time")

    ds = create_test_data()
    ds["dim1"] = ds["dim1"]
    grouped = ds.groupby("dim1")

    expected = reorder_dims(ds + ds.coords["dim1"])
    actual = grouped + ds.coords["dim1"]
    assert_identical(expected, reorder_dims(actual))

    actual = ds.coords["dim1"] + grouped
    assert_identical(expected, reorder_dims(actual))

    ds2 = 2 * ds
    expected = reorder_dims(ds + ds2)
    actual = grouped + ds2
    assert_identical(expected, reorder_dims(actual))

    actual = ds2 + grouped
    assert_identical(expected, reorder_dims(actual))


def test_groupby_math_more() -> None:
    ds = create_test_data()
    grouped = ds.groupby("numbers")
    zeros = DataArray([0, 0, 0, 0], [("numbers", range(4))])
    expected = (ds + Variable("dim3", np.zeros(10))).transpose(
        "dim3", "dim1", "dim2", "time"
    )
    actual = grouped + zeros
    assert_equal(expected, actual)

    actual = zeros + grouped
    assert_equal(expected, actual)

    with pytest.raises(ValueError, match=r"incompat.* grouped binary"):
        grouped + ds
    with pytest.raises(ValueError, match=r"incompat.* grouped binary"):
        ds + grouped
    with pytest.raises(TypeError, match=r"only support binary ops"):
        grouped + 1  # type: ignore[operator]
    with pytest.raises(TypeError, match=r"only support binary ops"):
        grouped + grouped  # type: ignore[operator]
    with pytest.raises(TypeError, match=r"in-place operations"):
        ds += grouped  # type: ignore[arg-type]

    ds = Dataset(
        {
            "x": ("time", np.arange(100)),
            "time": pd.date_range("2000-01-01", periods=100),
        }
    )
    with pytest.raises(ValueError, match=r"incompat.* grouped binary"):
        ds + ds.groupby("time.month")


def test_groupby_math_bitshift() -> None:
    # create new dataset of int's only
    ds = Dataset(
        {
            "x": ("index", np.ones(4, dtype=int)),
            "y": ("index", np.ones(4, dtype=int) * -1),
            "level": ("index", [1, 1, 2, 2]),
            "index": [0, 1, 2, 3],
        }
    )
    shift = DataArray([1, 2, 1], [("level", [1, 2, 8])])

    left_expected = Dataset(
        {
            "x": ("index", [2, 2, 4, 4]),
            "y": ("index", [-2, -2, -4, -4]),
            "level": ("index", [2, 2, 8, 8]),
            "index": [0, 1, 2, 3],
        }
    )

    left_manual = []
    for lev, group in ds.groupby("level"):
        shifter = shift.sel(level=lev)
        left_manual.append(group << shifter)
    left_actual = xr.concat(left_manual, dim="index").reset_coords(names="level")
    assert_equal(left_expected, left_actual)

    left_actual = (ds.groupby("level") << shift).reset_coords(names="level")
    assert_equal(left_expected, left_actual)

    right_expected = Dataset(
        {
            "x": ("index", [0, 0, 2, 2]),
            "y": ("index", [-1, -1, -2, -2]),
            "level": ("index", [0, 0, 4, 4]),
            "index": [0, 1, 2, 3],
        }
    )
    right_manual = []
    for lev, group in left_expected.groupby("level"):
        shifter = shift.sel(level=lev)
        right_manual.append(group >> shifter)
    right_actual = xr.concat(right_manual, dim="index").reset_coords(names="level")
    assert_equal(right_expected, right_actual)

    right_actual = (left_expected.groupby("level") >> shift).reset_coords(names="level")
    assert_equal(right_expected, right_actual)


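# The two spellings in the next test (groupby_bins kwargs vs. BinGrouper) are
# expected to agree because both forward include_lowest/right to pandas.cut.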
@pytest.mark.parametrize("use_flox", [True, False])
def test_groupby_bins_cut_kwargs(use_flox: bool) -> None:
    da = xr.DataArray(np.arange(12).reshape(6, 2), dims=("x", "y"))
    x_bins = (0, 2, 4, 6)

    with xr.set_options(use_flox=use_flox):
        actual = da.groupby_bins(
            "x", bins=x_bins, include_lowest=True, right=False
        ).mean()
    expected = xr.DataArray(
        np.array([[1.0, 2.0], [5.0, 6.0], [9.0, 10.0]]),
        dims=("x_bins", "y"),
        coords={
            "x_bins": ("x_bins", pd.IntervalIndex.from_breaks(x_bins, closed="left"))
        },
    )
    assert_identical(expected, actual)

    with xr.set_options(use_flox=use_flox):
        actual = da.groupby(
            x=BinGrouper(bins=x_bins, include_lowest=True, right=False),
        ).mean()
    assert_identical(expected, actual)


@pytest.mark.parametrize("indexed_coord", [True, False])
@pytest.mark.parametrize(
    ["groupby_method", "args"],
    (
        ("groupby_bins", ("x", np.arange(0, 8, 3))),
        ("groupby", ({"x": BinGrouper(bins=np.arange(0, 8, 3))},)),
    ),
)
def test_groupby_bins_math(groupby_method, args, indexed_coord) -> None:
    N = 7
    da = DataArray(np.random.random((N, N)), dims=("x", "y"))
    if indexed_coord:
        da["x"] = np.arange(N)
        da["y"] = np.arange(N)

    g = getattr(da, groupby_method)(*args)
    mean = g.mean()
    expected = da.isel(x=slice(1, None)) - mean.isel(x_bins=("x", [0, 0, 0, 1, 1, 1]))
    actual = g - mean
    assert_identical(expected, actual)


def test_groupby_math_nD_group() -> None:
    N = 40
    da = DataArray(
        np.random.random((N, N)),
        dims=("x", "y"),
        coords={
            "labels": (
                "x",
                np.repeat(["a", "b", "c", "d", "e", "f", "g", "h"], repeats=N // 8),
            ),
        },
    )
    da["labels2d"] = xr.broadcast(da.labels, da)[0]

    g = da.groupby("labels2d")
    mean = g.mean()
    expected = da - mean.sel(labels2d=da.labels2d)
    expected["labels"] = expected.labels.broadcast_like(expected.labels2d)
    actual = g - mean
    assert_identical(expected, actual)

    da["num"] = (
        "x",
        np.repeat([1, 2, 3, 4, 5, 6, 7, 8], repeats=N // 8),
    )
    da["num2d"] = xr.broadcast(da.num, da)[0]
    g = da.groupby_bins("num2d", bins=[0, 4, 6])
    mean = g.mean()
    idxr = np.digitize(da.num2d, bins=(0, 4, 6), right=True)[:30, :] - 1
    expanded_mean = mean.drop_vars("num2d_bins").isel(num2d_bins=(("x", "y"), idxr))
    expected = da.isel(x=slice(30)) - expanded_mean
    expected["labels"] = expected.labels.broadcast_like(expected.labels2d)
    expected["num"] = expected.num.broadcast_like(expected.num2d)
    expected["num2d_bins"] = (("x", "y"), mean.num2d_bins.data[idxr])
    actual = g - mean
    assert_identical(expected, actual)


def test_groupby_dataset_math_virtual() -> None:
    ds = Dataset({"x": ("t", [1, 2, 3])}, {"t": pd.date_range("20100101", periods=3)})
    grouped = ds.groupby("t.day")
    actual = grouped - grouped.mean(...)
    expected = Dataset({"x": ("t", [0, 0, 0])}, ds[["t", "t.day"]])
    assert_identical(actual, expected)


def test_groupby_math_dim_order() -> None:
    da = DataArray(
        np.ones((10, 10, 12)),
        dims=("x", "y", "time"),
        coords={"time": pd.date_range("2001-01-01", periods=12, freq="6h")},
    )
    grouped = da.groupby("time.day")
    result = grouped - grouped.mean()
    assert result.dims == da.dims


def test_groupby_dataset_nan() -> None:
    # nan should be excluded from groupby
    ds = Dataset({"foo": ("x", [1, 2, 3, 4])}, {"bar": ("x", [1, 1, 2, np.nan])})
    actual = ds.groupby("bar").mean(...)
    expected = Dataset({"foo": ("bar", [1.5, 3]), "bar": [1, 2]})
    assert_identical(actual, expected)


def test_groupby_dataset_order() -> None:
    # groupby should preserve variables order
    ds = Dataset()
    for vn in ["a", "b", "c"]:
        ds[vn] = DataArray(np.arange(10), dims=["t"])
    data_vars_ref = list(ds.data_vars.keys())
    ds = ds.groupby("t").mean(...)
    data_vars = list(ds.data_vars.keys())
    assert data_vars == data_vars_ref
    # coords are now at the end of the list, so the test below fails
    # all_vars = list(ds.variables.keys())
    # all_vars_ref = list(ds.variables.keys())
    # .assertEqual(all_vars, all_vars_ref)


def test_groupby_dataset_fillna() -> None:
    ds = Dataset({"a": ("x", [np.nan, 1, np.nan, 3])}, {"x": [0, 1, 2, 3]})
    expected = Dataset({"a": ("x", range(4))}, {"x": [0, 1, 2, 3]})
    for target in [ds, expected]:
        target.coords["b"] = ("x", [0, 0, 1, 1])
    actual = ds.groupby("b").fillna(DataArray([0, 2], dims="b"))
    assert_identical(expected, actual)

    actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])}))
    assert_identical(expected, actual)

    # attrs with groupby
    ds.attrs["attr"] = "ds"
    ds.a.attrs["attr"] = "da"
    actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])}))
    assert actual.attrs == ds.attrs
    assert actual.a.name == "a"
    assert actual.a.attrs == ds.a.attrs


def test_groupby_dataset_where() -> None:
    # groupby
    ds = Dataset({"a": ("x", range(5))}, {"c": ("x", [0, 0, 1, 1, 1])})
    cond = Dataset({"a": ("c", [True, False])})
    expected = ds.copy(deep=True)
    expected["a"].values = np.array([0, 1] + [np.nan] * 3)
    actual = ds.groupby("c").where(cond)
    assert_identical(expected, actual)

    # attrs with groupby
    ds.attrs["attr"] = "ds"
    ds.a.attrs["attr"] = "da"
    actual = ds.groupby("c").where(cond)
    assert actual.attrs == ds.attrs
    assert actual.a.name == "a"
    assert actual.a.attrs == ds.a.attrs


def test_groupby_dataset_assign() -> None:
    ds = Dataset({"a": ("x", range(3))}, {"b": ("x", ["A"] * 2 + ["B"])})
    actual = ds.groupby("b").assign(c=lambda ds: 2 * ds.a)
    expected = ds.merge({"c": ("x", [0, 2, 4])})
    assert_identical(actual, expected)

    actual = ds.groupby("b").assign(c=lambda ds: ds.a.sum())
    expected = ds.merge({"c": ("x", [1, 1, 2])})
    assert_identical(actual, expected)

    actual = ds.groupby("b").assign_coords(c=lambda ds: ds.a.sum())
    expected = expected.set_coords("c")
    assert_identical(actual, expected)


def test_groupby_dataset_map_dataarray_func() -> None:
    # regression GH6379
    ds = Dataset({"foo": ("x", [1, 2, 3, 4])}, coords={"x": [0, 0, 1, 1]})
    actual = ds.groupby("x").map(lambda grp: grp.foo.mean())
    expected = DataArray([1.5, 3.5], coords={"x": [0, 1]}, dims="x", name="foo")
    assert_identical(actual, expected)


def test_groupby_dataarray_map_dataset_func() -> None:
    # regression GH6379
    da = DataArray([1, 2, 3, 4], coords={"x": [0, 0, 1, 1]}, dims="x", name="foo")
    actual = da.groupby("x").map(lambda grp: grp.mean().to_dataset())
    expected = xr.Dataset({"foo": ("x", [1.5, 3.5])}, coords={"x": [0, 1]})
    assert_identical(actual, expected)


@requires_flox
@pytest.mark.parametrize("kwargs", [{"method": "map-reduce"}, {"engine": "numpy"}])
def test_groupby_flox_kwargs(kwargs) -> None:
    ds = Dataset({"a": ("x", range(5))}, {"c": ("x", [0, 0, 1, 1, 1])})
    with xr.set_options(use_flox=False):
        expected = ds.groupby("c").mean()
    with xr.set_options(use_flox=True):
        actual = ds.groupby("c").mean(**kwargs)
    assert_identical(expected, actual)


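# In the setup below, the "abc" coordinate splits the y axis into three uneven
# groups: "a" (9 elements), "c" (1), and "b" (10).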
class TestDataArrayGroupBy:
    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        self.attrs = {"attr1": "value1", "attr2": 2929}
        self.x = np.random.random((10, 20))
        self.v = Variable(["x", "y"], self.x)
        self.va = Variable(["x", "y"], self.x, self.attrs)
        self.ds = Dataset({"foo": self.v})
        self.dv = self.ds["foo"]

        self.mindex = pd.MultiIndex.from_product(
            [["a", "b"], [1, 2]], names=("level_1", "level_2")
        )
        self.mda = DataArray([0, 1, 2, 3], coords={"x": self.mindex}, dims="x")

        self.da = self.dv.copy()
        self.da.coords["abc"] = ("y", np.array(["a"] * 9 + ["c"] + ["b"] * 10))
        self.da.coords["y"] = 20 + 100 * self.da["y"]

    def test_stack_groupby_unsorted_coord(self) -> None:
        data = [[0, 1], [2, 3]]
        data_flat = [0, 1, 2, 3]
        dims = ["x", "y"]
        y_vals = [2, 3]

        arr = xr.DataArray(data, dims=dims, coords={"y": y_vals})
        actual1 = arr.stack(z=dims).groupby("z").first()
        midx1 = pd.MultiIndex.from_product([[0, 1], [2, 3]], names=dims)
        expected1 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx1})
        assert_equal(actual1, expected1)

        # GH: 3287. Note that y coord values are not in sorted order.
        arr = xr.DataArray(data, dims=dims, coords={"y": y_vals[::-1]})
        actual2 = arr.stack(z=dims).groupby("z").first()
        midx2 = pd.MultiIndex.from_product([[0, 1], [3, 2]], names=dims)
        expected2 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx2})
        assert_equal(actual2, expected2)

    def test_groupby_iter(self) -> None:
        for (act_x, act_dv), (exp_x, exp_ds) in zip(
            self.dv.groupby("y"), self.ds.groupby("y"), strict=True
        ):
            assert exp_x == act_x
            assert_identical(exp_ds["foo"], act_dv)
        for (_, exp_dv), (_, act_dv) in zip(
            self.dv.groupby("x"), self.dv.groupby("x"), strict=True
        ):
            assert_identical(exp_dv, act_dv)

    def test_groupby_properties(self) -> None:
        grouped = self.da.groupby("abc")
        expected_groups = {"a": range(9), "c": [9], "b": range(10, 20)}
        assert expected_groups.keys() == grouped.groups.keys()
        for key in expected_groups:
            expected_group = expected_groups[key]
            actual_group = grouped.groups[key]

            # TODO: array_api doesn't allow slice:
            assert not isinstance(expected_group, slice)
            assert not isinstance(actual_group, slice)

            np.testing.assert_array_equal(expected_group, actual_group)
        assert 3 == len(grouped)

    @pytest.mark.parametrize(
        "by, use_da", [("x", False), ("y", False), ("y", True), ("abc", False)]
    )
    @pytest.mark.parametrize("shortcut", [True, False])
    def test_groupby_map_identity(self, by, use_da, shortcut) -> None:
        expected = self.da
        if use_da:
            by = expected.coords[by]

        def identity(x):
            return x

        grouped = expected.groupby(by)
        actual = grouped.map(identity, shortcut=shortcut)
        assert_identical(expected, actual)

    def test_groupby_sum(self) -> None:
        array = self.da
        grouped = array.groupby("abc")

        expected_sum_all = Dataset(
            {
                "foo": Variable(
                    ["abc"],
                    np.array(
                        [
                            self.x[:, :9].sum(),
                            self.x[:, 10:].sum(),
                            self.x[:, 9:10].sum(),
                        ]
                    ).T,
                ),
                "abc": Variable(["abc"], np.array(["a", "b", "c"])),
            }
        )["foo"]
        assert_allclose(expected_sum_all, grouped.reduce(np.sum, dim=...))
        assert_allclose(expected_sum_all, grouped.sum(...))

        expected = DataArray(
            [
                array["y"].values[idx].sum()
                for idx in [slice(9), slice(10, None), slice(9, 10)]
            ],
            [["a", "b", "c"]],
            ["abc"],
        )
        actual = array["y"].groupby("abc").map(np.sum)
        assert_allclose(expected, actual)
        actual = array["y"].groupby("abc").sum(...)
        assert_allclose(expected, actual)

        expected_sum_axis1 = Dataset(
            {
                "foo": (
                    ["x", "abc"],
                    np.array(
                        [
                            self.x[:, :9].sum(1),
                            self.x[:, 10:].sum(1),
                            self.x[:, 9:10].sum(1),
                        ]
                    ).T,
                ),
                "abc": Variable(["abc"], np.array(["a", "b", "c"])),
            }
        )["foo"]
        assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y"))
        assert_allclose(expected_sum_axis1, grouped.sum("y"))

    @pytest.mark.parametrize("use_flox", [True, False])
    @pytest.mark.parametrize("shuffle", [True, False])
    @pytest.mark.parametrize(
        "chunk",
        [
            pytest.param(
                True, marks=pytest.mark.skipif(not has_dask, reason="no dask")
            ),
            False,
        ],
    )
    @pytest.mark.parametrize("method", ["sum", "mean", "median"])
    def test_groupby_reductions(
        self, use_flox: bool, method: str, shuffle: bool, chunk: bool
    ) -> None:
        if shuffle and chunk and not has_dask_ge_2024_08_1:
            pytest.skip()

        array = self.da
        if chunk:
            array.data = array.chunk({"y": 5}).data
        reduction = getattr(np, method)
        expected = Dataset(
            {
                "foo": Variable(
                    ["x", "abc"],
                    np.array(
                        [
                            reduction(self.x[:, :9], axis=-1),
                            reduction(self.x[:, 10:], axis=-1),
                            reduction(self.x[:, 9:10], axis=-1),
                        ]
                    ).T,
                ),
                "abc": Variable(["abc"], np.array(["a", "b", "c"])),
            }
        )["foo"]

        with raise_if_dask_computes():
            grouped = array.groupby("abc")
            if shuffle:
                grouped = grouped.shuffle_to_chunks().groupby("abc")

            with xr.set_options(use_flox=use_flox):
                actual = getattr(grouped, method)(dim="y")
        assert_allclose(expected, actual)

    def test_groupby_count(self) -> None:
        array = DataArray(
            [0, 0, np.nan, np.nan, 0, 0],
            coords={"cat": ("x", ["a", "b", "b", "c", "c", "c"])},
            dims="x",
        )
        actual = array.groupby("cat").count()
        expected = DataArray([1, 1, 2], coords=[("cat", ["a", "b", "c"])])
        assert_identical(actual, expected)

    @pytest.mark.parametrize("shortcut", [True, False])
    @pytest.mark.parametrize("keep_attrs", [None, True, False])
    def test_groupby_reduce_keep_attrs(
        self, shortcut: bool, keep_attrs: bool | None
    ) -> None:
        array = self.da
        array.attrs["foo"] = "bar"

        actual = array.groupby("abc").reduce(
            np.mean, keep_attrs=keep_attrs, shortcut=shortcut
        )
        with xr.set_options(use_flox=False):
            expected = array.groupby("abc").mean(keep_attrs=keep_attrs)
        assert_identical(expected, actual)

    @pytest.mark.parametrize("keep_attrs", [None, True, False])
    def test_groupby_keep_attrs(self, keep_attrs: bool | None) -> None:
        array = self.da
        array.attrs["foo"] = "bar"

        with xr.set_options(use_flox=False):
            expected = array.groupby("abc").mean(keep_attrs=keep_attrs)
        with xr.set_options(use_flox=True):
            actual = array.groupby("abc").mean(keep_attrs=keep_attrs)

        # values are tested elsewhere, here we just check data
        # TODO: add check_attrs kwarg to assert_allclose
        actual.data = expected.data
        assert_identical(expected, actual)

    def test_groupby_map_center(self) -> None:
        def center(x):
            return x - np.mean(x)

        array = self.da
        grouped = array.groupby("abc")

        expected_ds = array.to_dataset()
        exp_data = np.hstack(
            [center(self.x[:, :9]), center(self.x[:, 9:10]), center(self.x[:, 10:])]
        )
        expected_ds["foo"] = (["x", "y"], exp_data)
        expected_centered = expected_ds["foo"]
        assert_allclose(expected_centered, grouped.map(center))

    def test_groupby_map_ndarray(self) -> None:
        # regression test for #326
        array = self.da
        grouped = array.groupby("abc")
        actual = grouped.map(np.asarray)  # type: ignore[arg-type] # TODO: Not sure using np.asarray like this makes sense with array api
        assert_equal(array, actual)

    def test_groupby_map_changes_metadata(self) -> None:
        def change_metadata(x):
            x.coords["x"] = x.coords["x"] * 2
            x.attrs["fruit"] = "lemon"
            return x

        array = self.da
        grouped = array.groupby("abc")
        actual = grouped.map(change_metadata)
        expected = array.copy()
        expected = change_metadata(expected)
        assert_equal(expected, actual)

    def test_groupby_math_squeeze(self) -> None:
        array = self.da
        grouped = array.groupby("x")

        expected = array + array.coords["x"]
        actual = grouped + array.coords["x"]
        assert_identical(expected, actual)

        actual = array.coords["x"] + grouped
        assert_identical(expected, actual)

        ds = array.coords["x"].to_dataset(name="X")
        expected = array + ds
        actual = grouped + ds
        assert_identical(expected, actual)

        actual = ds + grouped
        assert_identical(expected, actual)

    def test_groupby_math(self) -> None:
        array = self.da
        grouped = array.groupby("abc")
        expected_agg = (grouped.mean(...) - np.arange(3)).rename(None)
        actual = grouped - DataArray(range(3), [("abc", ["a", "b", "c"])])
        actual_agg = actual.groupby("abc").mean(...)
        assert_allclose(expected_agg, actual_agg)

        with pytest.raises(TypeError, match=r"only support binary ops"):
            grouped + 1  # type: ignore[type-var]
        with pytest.raises(TypeError, match=r"only support binary ops"):
            grouped + grouped  # type: ignore[type-var]
        with pytest.raises(TypeError, match=r"in-place operations"):
            array += grouped  # type: ignore[arg-type]

    def test_groupby_math_not_aligned(self) -> None:
        array = DataArray(
            range(4), {"b": ("x", [0, 0, 1, 1]), "x": [0, 1, 2, 3]}, dims="x"
        )
        other = DataArray([10], coords={"b": [0]}, dims="b")
        actual = array.groupby("b") + other
        expected = DataArray([10, 11, np.nan, np.nan], array.coords)
        assert_identical(expected, actual)

        # regression test for #7797
        other = array.groupby("b").sum()
        actual = array.sel(x=[0, 1]).groupby("b") - other
        expected = DataArray([-1, 0], {"b": ("x", [0, 0]), "x": [0, 1]}, dims="x")
        assert_identical(expected, actual)

        other = DataArray([10], coords={"c": 123, "b": [0]}, dims="b")
        actual = array.groupby("b") + other
        expected = DataArray([10, 11, np.nan, np.nan], array.coords)
        expected.coords["c"] = (["x"], [123] * 2 + [np.nan] * 2)
        assert_identical(expected, actual)

        other_ds = Dataset({"a": ("b", [10])}, {"b": [0]})
        actual_ds = array.groupby("b") + other_ds
        expected_ds = Dataset({"a": ("x", [10, 11, np.nan, np.nan])}, array.coords)
        assert_identical(expected_ds, actual_ds)

    def test_groupby_restore_dim_order(self) -> None:
        array = DataArray(
            np.random.randn(5, 3),
            coords={"a": ("x", range(5)), "b": ("y", range(3))},
            dims=["x", "y"],
        )
        for by, expected_dims in [
            ("x", ("x", "y")),
            ("y", ("x", "y")),
            ("a", ("a", "y")),
            ("b", ("x", "b")),
        ]:
            result = array.groupby(by).map(lambda x: x.squeeze())
            assert result.dims == expected_dims

    def test_groupby_restore_coord_dims(self) -> None:
        array = DataArray(
            np.random.randn(5, 3),
            coords={
                "a": ("x", range(5)),
                "b": ("y", range(3)),
                "c": (("x", "y"), np.random.randn(5, 3)),
            },
            dims=["x", "y"],
        )

        for by, expected_dims in [
            ("x", ("x", "y")),
            ("y", ("x", "y")),
            ("a", ("a", "y")),
            ("b", ("x", "b")),
        ]:
            result = array.groupby(by, restore_coord_dims=True).map(
                lambda x: x.squeeze()
            )["c"]
            assert result.dims == expected_dims

    def test_groupby_first_and_last(self) -> None:
        array = DataArray([1, 2, 3, 4, 5], dims="x")
        by = DataArray(["a"] * 2 + ["b"] * 3, dims="x", name="ab")

        expected = DataArray([1, 3], [("ab", ["a", "b"])])
        actual = array.groupby(by).first()
        assert_identical(expected, actual)

        expected = DataArray([2, 5], [("ab", ["a", "b"])])
        actual = array.groupby(by).last()
        assert_identical(expected, actual)

        array = DataArray(np.random.randn(5, 3), dims=["x", "y"])
        expected = DataArray(array[[0, 2]], {"ab": ["a", "b"]}, ["ab", "y"])
        actual = array.groupby(by).first()
        assert_identical(expected, actual)

        actual = array.groupby("x").first()
        expected = array  # should be a no-op
        assert_identical(expected, actual)

    def make_groupby_multidim_example_array(self) -> DataArray:
        return DataArray(
            [[[0, 1], [2, 3]], [[5, 10], [15, 20]]],
            coords={
                "lon": (["ny", "nx"], [[30, 40], [40, 50]]),
                "lat": (["ny", "nx"], [[10, 10], [20, 20]]),
            },
            dims=["time", "ny", "nx"],
        )

    def test_groupby_multidim(self) -> None:
        array = self.make_groupby_multidim_example_array()
        for dim, expected_sum in [
            ("lon", DataArray([5, 28, 23], coords=[("lon", [30.0, 40.0, 50.0])])),
            ("lat", DataArray([16, 40], coords=[("lat", [10.0, 20.0])])),
        ]:
            actual_sum = array.groupby(dim).sum(...)
            assert_identical(expected_sum, actual_sum)

    def test_groupby_multidim_map(self) -> None:
        array = self.make_groupby_multidim_example_array()
        actual = array.groupby("lon").map(lambda x: x - x.mean())
        expected = DataArray(
            [[[-2.5, -6.0], [-5.0, -8.5]], [[2.5, 3.0], [8.0, 8.5]]],
            coords=array.coords,
            dims=array.dims,
        )
        assert_identical(expected, actual)

    @pytest.mark.parametrize("use_flox", [True, False])
    @pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]])
    @pytest.mark.parametrize(
        "cut_kwargs",
        (
            {"labels": None, "include_lowest": True},
            {"labels": None, "include_lowest": False},
            {"labels": ["a", "b"]},
            {"labels": [1.2, 3.5]},
            {"labels": ["b", "a"]},
        ),
    )
    def test_groupby_bins(
        self,
        coords: np.typing.ArrayLike,
        use_flox: bool,
        cut_kwargs: dict,
    ) -> None:
        array = DataArray(
            np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a"
        )
        # the first value should not be part of any group ("right" binning)
        array[0] = 99
        # bins follow conventions for pandas.cut
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html
        bins = [0, 1.5, 5]

        df = array.to_dataframe()
        df["dim_0_bins"] = pd.cut(array["dim_0"], bins, **cut_kwargs)  # type: ignore[call-overload]

        expected_df = df.groupby("dim_0_bins", observed=True).sum()
        # TODO: can't convert df with IntervalIndex to Xarray
        expected = (
            expected_df.reset_index(drop=True)
            .to_xarray()
            .assign_coords(index=np.array(expected_df.index))
            .rename({"index": "dim_0_bins"})["a"]
        )

        with xr.set_options(use_flox=use_flox):
            gb = array.groupby_bins("dim_0", bins=bins, **cut_kwargs)
            shuffled = gb.shuffle_to_chunks().groupby_bins(
                "dim_0", bins=bins, **cut_kwargs
            )
            actual = gb.sum()
            assert_identical(expected, actual)
            assert_identical(expected, shuffled.sum())

            actual = gb.map(lambda x: x.sum())
            assert_identical(expected, actual)
            assert_identical(expected, shuffled.map(lambda x: x.sum()))

        # make sure original array dims are unchanged
        assert len(array.dim_0) == 4

    def test_groupby_bins_ellipsis(self) -> None:
        da = xr.DataArray(np.ones((2, 3, 4)))
        bins = [-1, 0, 1, 2]
        with xr.set_options(use_flox=False):
            actual = da.groupby_bins("dim_0", bins).mean(...)
        with xr.set_options(use_flox=True):
            expected = da.groupby_bins("dim_0", bins).mean(...)
        assert_allclose(actual, expected)

    @pytest.mark.parametrize("use_flox", [True, False])
    def test_groupby_bins_gives_correct_subset(self, use_flox: bool) -> None:
        # GH7766
        rng = np.random.default_rng(42)
        coords = rng.normal(5, 5, 1000)
        bins = np.logspace(-4, 1, 10)
        labels = [
            "one",
            "two",
            "three",
            "four",
            "five",
            "six",
            "seven",
            "eight",
            "nine",
        ]
        # xArray
        # Make a mock dataarray
        darr = xr.DataArray(coords, coords=[coords], dims=["coords"])
        expected = xr.DataArray(
            [np.nan, np.nan, 1, 1, 1, 8, 31, 104, 542],
            dims="coords_bins",
            coords={"coords_bins": labels},
        )
        gb = darr.groupby_bins("coords", bins, labels=labels)
        with xr.set_options(use_flox=use_flox):
            actual = gb.count()
        assert_identical(actual, expected)

    def test_groupby_bins_empty(self) -> None:
        array = DataArray(np.arange(4), [("x", range(4))])
        # one of these bins will be empty
        bins = [0, 4, 5]
        bin_coords = pd.cut(array["x"], bins).categories  # type: ignore[call-overload]
        actual = array.groupby_bins("x", bins).sum()
        expected = DataArray([6, np.nan], dims="x_bins", coords={"x_bins": bin_coords})
        assert_identical(expected, actual)
        # make sure original array is unchanged
        # (was a problem in earlier versions)
        assert len(array.x) == 4

    def test_groupby_bins_multidim(self) -> None:
        array = self.make_groupby_multidim_example_array()
        bins = [0, 15, 20]
        bin_coords = pd.cut(array["lat"].values.flat, bins).categories  # type: ignore[call-overload]
        expected = DataArray([16, 40], dims="lat_bins", coords={"lat_bins": bin_coords})
        actual = array.groupby_bins("lat", bins).map(lambda x: x.sum())
        assert_identical(expected, actual)
        # modify the array coordinates to be non-monotonic after unstacking
        array["lat"].data = np.array([[10.0, 20.0], [20.0, 10.0]])
        expected = DataArray([28, 28], dims="lat_bins", coords={"lat_bins": bin_coords})
        actual = array.groupby_bins("lat", bins).map(lambda x: x.sum())
        assert_identical(expected, actual)

        bins = [-2, -1, 0, 1, 2]
        field = DataArray(np.ones((5, 3)), dims=("x", "y"))
        by = DataArray(
            np.array([[-1.5, -1.5, 0.5, 1.5, 1.5] * 3]).reshape(5, 3), dims=("x", "y")
        )
        actual = field.groupby_bins(by, bins=bins).count()

        bincoord = np.array(
            [
                pd.Interval(left, right, closed="right")
                for left, right in pairwise(bins)
            ],
            dtype=object,
        )
        expected = DataArray(
            np.array([6, np.nan, 3, 6]),
            dims="group_bins",
            coords={"group_bins": bincoord},
        )
        assert_identical(actual, expected)

    def test_groupby_bins_sort(self) -> None:
        data = xr.DataArray(
            np.arange(100), dims="x", coords={"x": np.linspace(-100, 100, num=100)}
        )
        binned_mean = data.groupby_bins("x", bins=11).mean()
        assert binned_mean.to_index().is_monotonic_increasing

        with xr.set_options(use_flox=True):
            actual = data.groupby_bins("x", bins=11).count()
        with xr.set_options(use_flox=False):
            expected = data.groupby_bins("x", bins=11).count()
        assert_identical(actual, expected)

    def test_groupby_assign_coords(self) -> None:
        array = DataArray([1, 2, 3, 4], {"c": ("x", [0, 0, 1, 1])}, dims="x")
        actual = array.groupby("c").assign_coords(d=lambda a: a.mean())
        expected = array.copy()
        expected.coords["d"] = ("x", [1.5, 1.5, 3.5, 3.5])
        assert_identical(actual, expected)

    def test_groupby_fillna(self) -> None:
        a = DataArray([np.nan, 1, np.nan, 3], coords={"x": range(4)}, dims="x")
        fill_value = DataArray([0, 1], dims="y")
        actual = a.fillna(fill_value)
        expected = DataArray(
            [[0, 1], [1, 1], [0, 1], [3, 3]], coords={"x": range(4)}, dims=("x", "y")
        )
        assert_identical(expected, actual)

        b = DataArray(range(4), coords={"x": range(4)}, dims="x")
        expected = b.copy()
        for target in [a, expected]:
            target.coords["b"] = ("x", [0, 0, 1, 1])
        actual = a.groupby("b").fillna(DataArray([0, 2], dims="b"))
        assert_identical(expected, actual)

    @pytest.mark.parametrize("use_flox", [True, False])
    def test_groupby_fastpath_for_monotonic(self, use_flox: bool) -> None:
        # Fixes https://github.com/pydata/xarray/issues/6220
        # Fixes https://github.com/pydata/xarray/issues/9279
        index = [1, 2, 3, 4, 7, 9, 10]
        array = DataArray(np.arange(len(index)), [("idx", index)])
        array_rev = array.copy().assign_coords({"idx": index[::-1]})
        fwd = array.groupby("idx", squeeze=False)
        rev = array_rev.groupby("idx", squeeze=False)

        for gb in [fwd, rev]:
            assert all(isinstance(elem, slice) for elem in gb.encoded.group_indices)

        with xr.set_options(use_flox=use_flox):
            assert_identical(fwd.sum(), array)
            assert_identical(rev.sum(), array_rev)


class TestDataArrayResample:
    @pytest.mark.parametrize("shuffle", [True, False])
    @pytest.mark.parametrize("use_cftime", [True, False])
    @pytest.mark.parametrize(
        "resample_freq",
        [
            "24h",
            "123456s",
            "1234567890us",
            pd.Timedelta(hours=2),
            pd.offsets.MonthBegin(),
            pd.offsets.Second(123456),
            datetime.timedelta(days=1, hours=6),
        ],
    )
    def test_resample(
        self, use_cftime: bool, shuffle: bool, resample_freq: ResampleCompatible
    ) -> None:
        if use_cftime and not has_cftime:
            pytest.skip()
        times = xr.date_range(
            "2000-01-01", freq="6h", periods=10, use_cftime=use_cftime
        )

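        # pandas cannot resample a CFTimeIndex, so round-trip through a
        # nanosecond DatetimeIndex and convert the result back afterwards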
        def resample_as_pandas(array, *args, **kwargs):
            array_ = array.copy(deep=True)
            if use_cftime:
                array_["time"] = times.to_datetimeindex(time_unit="ns")
            result = DataArray.from_series(
                array_.to_series().resample(*args, **kwargs).mean()
            )
            if use_cftime:
                result = result.convert_calendar(
                    calendar="standard", use_cftime=use_cftime
                )
            return result

        array = DataArray(np.arange(10), [("time", times)])

        rs = array.resample(time=resample_freq)
        shuffled = rs.shuffle_to_chunks().resample(time=resample_freq)
        actual = rs.mean()
        expected = resample_as_pandas(array, resample_freq)
        assert_identical(expected, actual)
        assert_identical(expected, shuffled.mean())

        assert_identical(expected, rs.reduce(np.mean))
        assert_identical(expected, shuffled.reduce(np.mean))

        rs = array.resample(time="24h", closed="right")
        actual = rs.mean()
        shuffled = rs.shuffle_to_chunks().resample(time="24h", closed="right")
        expected = resample_as_pandas(array, "24h", closed="right")
        assert_identical(expected, actual)
        assert_identical(expected, shuffled.mean())

        with pytest.raises(ValueError, match=r"Index must be monotonic"):
            array[[2, 0, 1]].resample(time=resample_freq)

        reverse = array.isel(time=slice(-1, None, -1))
        with pytest.raises(ValueError):
            reverse.resample(time=resample_freq).mean()

    @pytest.mark.parametrize("use_cftime", [True, False])
    def test_resample_doctest(self, use_cftime: bool) -> None:
        # run the doctest example here so we are not surprised
        if use_cftime and not has_cftime:
            pytest.skip()

        da = xr.DataArray(
            np.array([1, 2, 3, 1, 2, np.nan]),
            dims="time",
            coords=dict(
                time=(
                    "time",
                    xr.date_range(
                        "2001-01-01", freq="ME", periods=6, use_cftime=use_cftime
                    ),
                ),
                labels=("time", np.array(["a", "b", "c", "c", "b", "a"])),
            ),
        )
        actual = da.resample(time="3ME").count()
        expected = DataArray(
            [1, 3, 1],
            dims="time",
            coords={
                "time": xr.date_range(
                    "2001-01-01", freq="3ME", periods=3, use_cftime=use_cftime
                )
            },
        )
        assert_identical(actual, expected)

    def test_da_resample_func_args(self) -> None:
        def func(arg1, arg2, arg3=0.0):
            return arg1.mean("time") + arg2 + arg3

        times = pd.date_range("2000", periods=3, freq="D")
        da = xr.DataArray([1.0, 1.0, 1.0], coords=[times], dims=["time"])
        expected = xr.DataArray([3.0, 3.0, 3.0], coords=[times], dims=["time"])
        actual = da.resample(time="D").map(func, args=(1.0,), arg3=1.0)
        assert_identical(actual, expected)

    def test_resample_first(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        array = DataArray(np.arange(10), [("time", times)])

        # resample to same frequency
        actual = array.resample(time="6h").first()
        assert_identical(array, actual)

        actual = array.resample(time="1D").first()
        expected = DataArray([0, 4, 8], [("time", times[::4])])
        assert_identical(expected, actual)

        # verify that labels don't use the first value
        actual = array.resample(time="24h").first()
        expected = DataArray(array.to_series().resample("24h").first())
        assert_identical(expected, actual)

        # missing values
        array = array.astype(float)
        array[:2] = np.nan
        actual = array.resample(time="1D").first()
        expected = DataArray([2, 4, 8], [("time", times[::4])])
        assert_identical(expected, actual)

        actual = array.resample(time="1D").first(skipna=False)
        expected = DataArray([np.nan, 4, 8], [("time", times[::4])])
        assert_identical(expected, actual)

        # regression test for https://stackoverflow.com/questions/33158558/
        array = Dataset({"time": times})["time"]
        actual = array.resample(time="1D").last()
        expected_times = pd.to_datetime(
            ["2000-01-01T18", "2000-01-02T18", "2000-01-03T06"], unit="ns"
        )
        expected = DataArray(expected_times, [("time", times[::4])], name="time")
        assert_identical(expected, actual)

    def test_resample_bad_resample_dim(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        array = DataArray(np.arange(10), [("__resample_dim__", times)])
        with pytest.raises(ValueError, match=r"Proxy resampling dimension"):
            array.resample(__resample_dim__="1D").first()

    @requires_scipy
    def test_resample_drop_nondim_coords(self) -> None:
        xs = np.arange(6)
        ys = np.arange(3)
        times = pd.date_range("2000-01-01", freq="6h", periods=5)
        data = np.tile(np.arange(5), (6, 3, 1))
        xx, yy = np.meshgrid(xs * 5, ys * 2.5)
        tt = np.arange(len(times), dtype=int)
        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
        xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y"))
        ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y"))
        tcoord = DataArray(tt, {"time": times}, ("time",))
        ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord})
        ds = ds.set_coords(["xc", "yc", "tc"])

        # Select the data now, with the auxiliary coordinates in place
        array = ds["data"]

        # Re-sample
        actual = array.resample(time="12h", restore_coord_dims=True).mean("time")
        assert "tc" not in actual.coords

        # Up-sample - filling
        actual = array.resample(time="1h", restore_coord_dims=True).ffill()
        assert "tc" not in actual.coords

        # Up-sample - interpolation
        actual = array.resample(time="1h", restore_coord_dims=True).interpolate(
            "linear"
        )
        assert "tc" not in actual.coords

    def test_resample_keep_attrs(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        array = DataArray(np.ones(10), [("time", times)])
        array.attrs["meta"] = "data"

        result = array.resample(time="1D").mean(keep_attrs=True)
        expected = DataArray([1, 1, 1], [("time", times[::4])], attrs=array.attrs)
        assert_identical(result, expected)

    def test_resample_skipna(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        array = DataArray(np.ones(10), [("time", times)])
        array[1] = np.nan

        result = array.resample(time="1D").mean(skipna=False)
        expected = DataArray([np.nan, 1, 1], [("time", times[::4])])
        assert_identical(result, expected)

    def test_upsample(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=5)
        array = DataArray(np.arange(5), [("time", times)])

        # Forward-fill
        actual = array.resample(time="3h").ffill()
        expected = DataArray(array.to_series().resample("3h").ffill())
        assert_identical(expected, actual)

        # Backward-fill
        actual = array.resample(time="3h").bfill()
        expected = DataArray(array.to_series().resample("3h").bfill())
        assert_identical(expected, actual)

        # As frequency
        actual = array.resample(time="3h").asfreq()
        expected = DataArray(array.to_series().resample("3h").asfreq())
        assert_identical(expected, actual)

        # Pad
        actual = array.resample(time="3h").pad()
        expected = DataArray(array.to_series().resample("3h").ffill())
        assert_identical(expected, actual)

        # Nearest
        rs = array.resample(time="3h")
        actual = rs.nearest()
        new_times = rs.groupers[0].full_index
        expected = DataArray(array.reindex(time=new_times, method="nearest"))
        assert_identical(expected, actual)

    def test_upsample_nd(self) -> None:
        # Same as before, but now we try on multi-dimensional DataArrays.
        xs = np.arange(6)
        ys = np.arange(3)
        times = pd.date_range("2000-01-01", freq="6h", periods=5)
        data = np.tile(np.arange(5), (6, 3, 1))
        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))

        # Forward-fill
        actual = array.resample(time="3h").ffill()
        expected_data = np.repeat(data, 2, axis=-1)
        expected_times = times.to_series().resample("3h").asfreq().index
        expected_data = expected_data[..., : len(expected_times)]
        expected = DataArray(
            expected_data,
            {"time": expected_times, "x": xs, "y": ys},
            ("x", "y", "time"),
        )
        assert_identical(expected, actual)

        # Backward-fill
        actual = array.resample(time="3h").bfill()
        expected_data = np.repeat(data, 2, axis=-1)[..., 1:]
        expected_times = times.to_series().resample("3h").asfreq().index
        expected_data = expected_data[..., : len(expected_times)]
        expected = DataArray(
            expected_data,
            {"time": expected_times, "x": xs, "y": ys},
            ("x", "y", "time"),
        )
        assert_identical(expected, actual)

        # As frequency
        actual = array.resample(time="3h").asfreq()
        expected_data = np.repeat(data, 2, axis=-1).astype(float)[..., :-1]
        expected_data[..., 1::2] = np.nan
        expected_times = times.to_series().resample("3h").asfreq().index
        expected = DataArray(
            expected_data,
            {"time": expected_times, "x": xs, "y": ys},
            ("x", "y", "time"),
        )
        assert_identical(expected, actual)

        # Pad
        actual = array.resample(time="3h").pad()
        expected_data = np.repeat(data, 2, axis=-1)
        expected_data[..., 1::2] = expected_data[..., ::2]
        expected_data = expected_data[..., :-1]
        expected_times = times.to_series().resample("3h").asfreq().index
        expected = DataArray(
            expected_data,
            {"time": expected_times, "x": xs, "y": ys},
            ("x", "y", "time"),
        )
        assert_identical(expected, actual)

    def test_upsample_tolerance(self) -> None:
        # Test tolerance keyword for upsample methods bfill, pad, nearest
        times = pd.date_range("2000-01-01", freq="1D", periods=2)
        times_upsampled = pd.date_range("2000-01-01", freq="6h", periods=5)
        array = DataArray(np.arange(2), [("time", times)])

        # Forward fill
        actual = array.resample(time="6h").ffill(tolerance="12h")
        expected = DataArray([0.0, 0.0, 0.0, np.nan, 1.0], [("time", times_upsampled)])
        assert_identical(expected, actual)

        # Backward fill
        actual = array.resample(time="6h").bfill(tolerance="12h")
        expected = DataArray([0.0, np.nan, 1.0, 1.0, 1.0], [("time", times_upsampled)])
        assert_identical(expected, actual)

        # Nearest
        actual = array.resample(time="6h").nearest(tolerance="6h")
        expected = DataArray([0, 0, np.nan, 1, 1], [("time", times_upsampled)])
        assert_identical(expected, actual)

    @requires_scipy
    def test_upsample_interpolate(self) -> None:
        from scipy.interpolate import interp1d

        xs = np.arange(6)
        ys = np.arange(3)
        times = pd.date_range("2000-01-01", freq="6h", periods=5)

        z = np.arange(5) ** 2
        data = np.tile(z, (6, 3, 1))
        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))

        expected_times = times.to_series().resample("1h").asfreq().index
        # Split the times into equal sub-intervals to simulate the 6 hour
        # to 1 hour up-sampling
        new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5)
        kinds: list[InterpOptions] = [
            "linear",
            "nearest",
            "zero",
            "slinear",
            "quadratic",
            "cubic",
            "polynomial",
        ]
        for kind in kinds:
            kwargs = {}
            if kind == "polynomial":
                kwargs["order"] = 1
            actual = array.resample(time="1h").interpolate(kind, **kwargs)
            # with interp1d, the polynomial order is passed directly as an
            # int via ``kind``
            f = interp1d(
                np.arange(len(times)),
                data,
                kind=kwargs["order"] if kind == "polynomial" else kind,
                axis=-1,
                bounds_error=True,
                assume_sorted=True,
            )
            expected_data = f(new_times_idx)
            expected = DataArray(
                expected_data,
                {"time": expected_times, "x": xs, "y": ys},
                ("x", "y", "time"),
            )
            # Use assert_allclose because floating point arithmetic causes
            # small differences between the timeseries up-sampling and the
            # integer indexing used here
            assert_allclose(expected, actual, rtol=1e-16)

    @requires_scipy
    def test_upsample_interpolate_bug_2197(self) -> None:
        dates = pd.date_range("2007-02-01", "2007-03-01", freq="D", unit="s")
        da = xr.DataArray(np.arange(len(dates)), [("time", dates)])
        result = da.resample(time="ME").interpolate("linear")
        expected_times = np.array(
            [np.datetime64("2007-02-28"), np.datetime64("2007-03-31")]
        )
        expected = xr.DataArray([27.0, np.nan], [("time", expected_times)])
        assert_equal(result, expected)

    @requires_scipy
    def test_upsample_interpolate_regression_1605(self) -> None:
        dates = pd.date_range("2016-01-01", "2016-03-31", freq="1D")
        expected = xr.DataArray(
            np.random.random((len(dates), 2, 3)),
            dims=("time", "x", "y"),
            coords={"time": dates},
        )
        actual = expected.resample(time="1D").interpolate("linear")
        assert_allclose(actual, expected, rtol=1e-16)

    @requires_dask
    @requires_scipy
    @pytest.mark.parametrize("chunked_time", [True, False])
    def test_upsample_interpolate_dask(self, chunked_time: bool) -> None:
        from scipy.interpolate import interp1d

        xs = np.arange(6)
        ys = np.arange(3)
        times = pd.date_range("2000-01-01", freq="6h", periods=5)

        z = np.arange(5) ** 2
        data = np.tile(z, (6, 3, 1))
        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
        chunks = {"x": 2, "y": 1}
        if chunked_time:
            chunks["time"] = 3

        expected_times = times.to_series().resample("1h").asfreq().index
        # Split the times into equal sub-intervals to simulate the 6 hour
        # to 1 hour up-sampling
        new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5)
        kinds: list[InterpOptions] = [
            "linear",
            "nearest",
            "zero",
            "slinear",
            "quadratic",
            "cubic",
            "polynomial",
        ]
        for kind in kinds:
            kwargs = {}
            if kind == "polynomial":
                kwargs["order"] = 1
            actual = array.chunk(chunks).resample(time="1h").interpolate(kind, **kwargs)
            actual = actual.compute()
            # with interp1d, the polynomial order is passed directly as an
            # int via ``kind``
            f = interp1d(
                np.arange(len(times)),
                data,
                kind=kwargs["order"] if kind == "polynomial" else kind,
                axis=-1,
                bounds_error=True,
                assume_sorted=True,
            )
            expected_data = f(new_times_idx)
            expected = DataArray(
                expected_data,
                {"time": expected_times, "x": xs, "y": ys},
                ("x", "y", "time"),
            )
            # Use assert_allclose because floating point arithmetic causes
            # small differences between the timeseries up-sampling and the
            # integer indexing used here
            assert_allclose(expected, actual, rtol=1e-16)

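    # offset and origin shift the resampling bin edges and should match the
    # equivalent pandas resample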
    def test_resample_offset(self) -> None:
        times = pd.date_range("2000-01-01T02:03:01", freq="6h", periods=10)
        array = DataArray(np.arange(10), [("time", times)])

        offset = pd.Timedelta("11h")
        actual = array.resample(time="24h", offset=offset).mean()
        expected = DataArray(array.to_series().resample("24h", offset=offset).mean())
        assert_identical(expected, actual)

    def test_resample_origin(self) -> None:
        times = pd.date_range("2000-01-01T02:03:01", freq="6h", periods=10)
        array = DataArray(np.arange(10), [("time", times)])

        origin = "start"
        actual = array.resample(time="24h", origin=origin).mean()
        expected = DataArray(array.to_series().resample("24h", origin=origin).mean())
        assert_identical(expected, actual)


class TestDatasetResample:
    @pytest.mark.parametrize("use_cftime", [True, False])
    @pytest.mark.parametrize(
        "resample_freq",
        [
            "24h",
            "123456s",
            "1234567890us",
            pd.Timedelta(hours=2),
            pd.offsets.MonthBegin(),
            pd.offsets.Second(123456),
            datetime.timedelta(days=1, hours=6),
        ],
    )
    def test_resample(
        self, use_cftime: bool, resample_freq: ResampleCompatible
    ) -> None:
        if use_cftime and not has_cftime:
            pytest.skip()
        times = xr.date_range(
            "2000-01-01", freq="6h", periods=10, use_cftime=use_cftime
        )

        def resample_as_pandas(ds, *args, **kwargs):
            ds_ = ds.copy(deep=True)
            if use_cftime:
                ds_["time"] = times.to_datetimeindex(time_unit="ns")
            result = Dataset.from_dataframe(
                ds_.to_dataframe().resample(*args, **kwargs).mean()
            )
            if use_cftime:
                result = result.convert_calendar(
                    calendar="standard", use_cftime=use_cftime
                )
            return result

        ds = Dataset(
            {
                "foo": ("time", np.random.randint(1, 1000, 10)),
                "bar": ("time", np.random.randint(1, 1000, 10)),
                "time": times,
            }
        )

        actual = ds.resample(time=resample_freq).mean()
        expected = resample_as_pandas(ds, resample_freq)
        assert_identical(expected, actual)

        actual = ds.resample(time=resample_freq).reduce(np.mean)
        assert_identical(expected, actual)

        actual = ds.resample(time=resample_freq, closed="right").mean()
        expected = resample_as_pandas(ds, resample_freq, closed="right")
        assert_identical(expected, actual)

        with pytest.raises(ValueError, match=r"Index must be monotonic"):
            ds.isel(time=[2, 0, 1]).resample(time=resample_freq)

        reverse = ds.isel(time=slice(-1, None, -1))
        with pytest.raises(ValueError):
            reverse.resample(time=resample_freq).mean()

    def test_resample_and_first(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        ds = Dataset(
            {
                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
                "bar": ("time", np.random.randn(10), {"meta": "data"}),
                "time": times,
            }
        )

        actual = ds.resample(time="1D").first(keep_attrs=True)
        expected = ds.isel(time=[0, 4, 8])
        assert_identical(expected, actual)

        # upsampling
        expected_time = pd.date_range("2000-01-01", freq="3h", periods=19)
        expected = ds.reindex(time=expected_time)
        actual = ds.resample(time="3h")
        for how in ["mean", "sum", "first", "last"]:
            method = getattr(actual, how)
            result = method()
            assert_equal(expected, result)
        for method in [np.mean]:
            result = actual.reduce(method)
            assert_equal(expected, result)

    def test_resample_min_count(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        ds = Dataset(
            {
                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
                "bar": ("time", np.random.randn(10), {"meta": "data"}),
                "time": times,
            }
        )
        # inject nan
        ds["foo"] = xr.where(ds["foo"] > 2.0, np.nan, ds["foo"])

        actual = ds.resample(time="1D").sum(min_count=1)
        expected = xr.concat(
            [
                ds.isel(time=slice(i * 4, (i + 1) * 4)).sum("time", min_count=1)
                for i in range(3)
            ],
            dim=actual["time"],
        )
        assert_allclose(expected, actual)

    def test_resample_by_mean_with_keep_attrs(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        ds = Dataset(
            {
                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
                "bar": ("time", np.random.randn(10), {"meta": "data"}),
                "time": times,
            }
        )
        ds.attrs["dsmeta"] = "dsdata"

        resampled_ds = ds.resample(time="1D").mean(keep_attrs=True)
        actual = resampled_ds["bar"].attrs
        expected = ds["bar"].attrs
        assert expected == actual

        actual = resampled_ds.attrs
        expected = ds.attrs
        assert expected == actual

    def test_resample_by_mean_discarding_attrs(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        ds = Dataset(
            {
                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
                "bar": ("time", np.random.randn(10), {"meta": "data"}),
                "time": times,
            }
        )
        ds.attrs["dsmeta"] = "dsdata"

        resampled_ds = ds.resample(time="1D").mean(keep_attrs=False)

        assert resampled_ds["bar"].attrs == {}
        assert resampled_ds.attrs == {}

    def test_resample_by_last_discarding_attrs(self) -> None:
        times = pd.date_range("2000-01-01", freq="6h", periods=10)
        ds = Dataset(
            {
                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
                "bar": ("time", np.random.randn(10), {"meta": "data"}),
                "time": times,
            }
        )
        ds.attrs["dsmeta"] = "dsdata"

        resampled_ds = ds.resample(time="1D").last(keep_attrs=False)

        assert resampled_ds["bar"].attrs == {}
        assert resampled_ds.attrs == {}

    @requires_scipy
    def test_resample_drop_nondim_coords(self) -> None:
        xs = np.arange(6)
        ys = np.arange(3)
        times = pd.date_range("2000-01-01", freq="6h", periods=5)
        data = np.tile(np.arange(5), (6, 3, 1))
        xx, yy = np.meshgrid(xs * 5, ys * 2.5)
        tt = np.arange(len(times), dtype=int)
        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
        xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y"))
        ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y"))
        tcoord = DataArray(tt, {"time": times}, ("time",))
        ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord})
        ds = ds.set_coords(["xc", "yc", "tc"])

        # Re-sample
        actual = ds.resample(time="12h").mean("time")
        assert "tc" not in actual.coords

        # Up-sample - filling
        actual = ds.resample(time="1h").ffill()
        assert "tc" not in actual.coords

        # Up-sample - interpolation
        actual = ds.resample(time="1h").interpolate("linear")
        assert "tc" not in actual.coords

    def test_resample_ds_da_are_the_same(self) -> None:
        time = pd.date_range("2000-01-01", freq="6h", periods=365 * 4)
        ds = xr.Dataset(
            {
                "foo": (("time", "x"), np.random.randn(365 * 4, 5)),
                "time": time,
                "x": np.arange(5),
            }
        )
        assert_allclose(
            ds.resample(time="ME").mean()["foo"], ds.foo.resample(time="ME").mean()
        )

    def test_ds_resample_apply_func_args(self) -> None:
        def func(arg1, arg2, arg3=0.0):
            return arg1.mean("time") + arg2 + arg3

        times = pd.date_range("2000", freq="D", periods=3)
        ds = xr.Dataset({"foo": ("time", [1.0, 1.0, 1.0]), "time": times})
        expected = xr.Dataset({"foo": ("time", [3.0, 3.0, 3.0]), "time": times})
        actual = ds.resample(time="D").map(func, args=(1.0,), arg3=1.0)
        assert_identical(expected, actual)


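# cumulative group reductions keep the original dimension instead of
# collapsing it to one value per group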
def test_groupby_cumsum() -> None:
    ds = xr.Dataset(
        {"foo": (("x",), [7, 3, 1, 1, 1, 1, 1])},
        coords={"x": [0, 1, 2, 3, 4, 5, 6], "group_id": ("x", [0, 0, 1, 1, 2, 2, 2])},
    )
    actual = ds.groupby("group_id").cumsum(dim="x")
    expected = xr.Dataset(
        {
            "foo": (("x",), [7, 10, 1, 2, 1, 2, 3]),
        },
        coords={
            "x": [0, 1, 2, 3, 4, 5, 6],
            "group_id": ds.group_id,
        },
    )
    # TODO: remove drop_vars when GH6528 is fixed, i.e. when Dataset.cumsum
    # propagates indexes and the group variable
    assert_identical(expected.drop_vars(["x", "group_id"]), actual)

    actual = ds.foo.groupby("group_id").cumsum(dim="x")
    expected.coords["group_id"] = ds.group_id
    expected.coords["x"] = np.arange(7)
    assert_identical(expected.foo, actual)


def test_groupby_cumprod() -> None:
    ds = xr.Dataset(
        {"foo": (("x",), [7, 3, 0, 1, 1, 2, 1])},
        coords={"x": [0, 1, 2, 3, 4, 5, 6], "group_id": ("x", [0, 0, 1, 1, 2, 2, 2])},
    )
    actual = ds.groupby("group_id").cumprod(dim="x")
    expected = xr.Dataset(
        {
            "foo": (("x",), [7, 21, 0, 0, 1, 2, 2]),
        },
        coords={
            "x": [0, 1, 2, 3, 4, 5, 6],
            "group_id": ds.group_id,
        },
    )
    # TODO: remove drop_vars when GH6528 is fixed, i.e. when Dataset.cumsum
    # propagates indexes and the group variable
    assert_identical(expected.drop_vars(["x", "group_id"]), actual)

    actual = ds.foo.groupby("group_id").cumprod(dim="x")
    expected.coords["group_id"] = ds.group_id
    expected.coords["x"] = np.arange(7)
    assert_identical(expected.foo, actual)


@pytest.mark.parametrize(
    "method, expected_array",
    [
        ("cumsum", [1.0, 2.0, 5.0, 6.0, 2.0, 2.0]),
        ("cumprod", [1.0, 2.0, 6.0, 6.0, 2.0, 2.0]),
    ],
)
def test_resample_cumsum(method: str, expected_array: list[float]) -> None:
    ds = xr.Dataset(
        {"foo": ("time", [1, 2, 3, 1, 2, np.nan])},
        coords={
            "time": xr.date_range("01-01-2001", freq="ME", periods=6, use_cftime=False),
        },
    )
    actual = getattr(ds.resample(time="3ME"), method)(dim="time")
    expected = xr.Dataset(
        {"foo": (("time",), expected_array)},
        coords={
            "time": xr.date_range("01-01-2001", freq="ME", periods=6, use_cftime=False),
        },
    )
    # TODO: remove drop_vars when GH6528 is fixed, i.e. when Dataset.cumsum
    # propagates indexes and the group variable
    assert_identical(expected.drop_vars(["time"]), actual)

    actual = getattr(ds.foo.resample(time="3ME"), method)(dim="time")
    expected.coords["time"] = ds.time
    assert_identical(expected.drop_vars(["time"]).foo, actual)


def test_groupby_binary_op_regression() -> None:
    # regression test for #7797
    # monthly timeseries that should return "zero anomalies" everywhere
    time = xr.date_range("2023-01-01", "2023-12-31", freq="MS")
    data = np.linspace(-1, 1, 12)
    x = xr.DataArray(data, coords={"time": time})
    clim = xr.DataArray(data, coords={"month": np.arange(1, 13, 1)})

    # the full x gives the correct result; the bug appeared only with a slice
    x_slice = x.sel(time=["2023-04-01"])

    # compute anomalies via groupby arithmetic
    anom_gb = x_slice.groupby("time.month") - clim

    assert_identical(xr.zeros_like(anom_gb), anom_gb)


def test_groupby_multiindex_level() -> None:
    # GH6836
    midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two"))
    mda = xr.DataArray(np.random.rand(6, 3), [("x", midx), ("y", range(3))])
    groups = mda.groupby("one").groups
    assert groups == {"a": [0, 1], "b": [2, 3], "c": [4, 5]}


@requires_flox
@pytest.mark.parametrize("func", ["sum", "prod"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [None, 1])
def test_min_count_vs_flox(func: str, min_count: int | None, skipna: bool) -> None:
    da = DataArray(
        data=np.array([np.nan, 1, 1, np.nan, 1, 1]),
        dims="x",
        coords={"labels": ("x", np.array([1, 2, 3, 1, 2, 3]))},
    )

    gb = da.groupby("labels")
    method = operator.methodcaller(func, min_count=min_count, skipna=skipna)
    with xr.set_options(use_flox=True):
        actual = method(gb)
    with xr.set_options(use_flox=False):
        expected = method(gb)
    assert_identical(actual, expected)


@pytest.mark.parametrize("use_flox", [True, False])
def test_min_count_error(use_flox: bool) -> None:
    if use_flox and not has_flox:
        pytest.skip()
    da = DataArray(
        data=np.array([np.nan, 1, 1, np.nan, 1, 1]),
        dims="x",
        coords={"labels": ("x", np.array([1, 2, 3, 1, 2, 3]))},
    )
    with xr.set_options(use_flox=use_flox):
        with pytest.raises(TypeError):
            da.groupby("labels").mean(min_count=1)


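# groupby arithmetic on chunked data must stay lazy: InaccessibleArray raises
# if its values are actually read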
@requires_dask
def test_groupby_math_auto_chunk() -> None:
    da = xr.DataArray(
        [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
        dims=("y", "x"),
        coords={"label": ("x", [2, 2, 1])},
    )
    sub = xr.DataArray(
        InaccessibleArray(np.array([1, 2])), dims="label", coords={"label": [1, 2]}
    )
    chunked = da.chunk(x=1, y=2)
    chunked.label.load()
    actual = chunked.groupby("label") - sub
    assert actual.chunksizes == {"x": (1, 1, 1), "y": (2, 1)}


@pytest.mark.parametrize("use_flox", [True, False])
def test_groupby_dim_no_dim_equal(use_flox: bool) -> None:
    # https://github.com/pydata/xarray/issues/8263
    da = DataArray(
        data=[1, 2, 3, 4], dims="lat", coords={"lat": np.linspace(0, 1.01, 4)}
    )
    with xr.set_options(use_flox=use_flox):
        actual1 = da.drop_vars("lat").groupby("lat").sum()
        actual2 = da.groupby("lat").sum()
    assert_identical(actual1, actual2.drop_vars("lat"))


@requires_flox
def test_default_flox_method() -> None:
    import flox.xarray

    da = xr.DataArray([1, 2, 3], dims="x", coords={"label": ("x", [2, 2, 1])})

    result = xr.DataArray([3, 3], dims="label", coords={"label": [1, 2]})
    with mock.patch("flox.xarray.xarray_reduce", return_value=result) as mocked_reduce:
        da.groupby("label").sum()

    kwargs = mocked_reduce.call_args.kwargs
    if Version(flox.__version__) < Version("0.9.0"):
        assert kwargs["method"] == "cohorts"
    else:
        assert "method" not in kwargs


@requires_cftime
@pytest.mark.filterwarnings("ignore")
def test_cftime_resample_gh_9108():
    import cftime

    ds = Dataset(
        {"pr": ("time", np.random.random((10,)))},
        coords={"time": xr.date_range("0001-01-01", periods=10, freq="D")},
    )
    actual = ds.resample(time="ME").mean()
    expected = ds.mean("time").expand_dims(
        time=[cftime.DatetimeGregorian(1, 1, 31, 0, 0, 0, 0, has_year_zero=False)]
    )
    assert actual.time.data[0].has_year_zero == ds.time.data[0].has_year_zero
    assert_equal(actual, expected)


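# custom Grouper subclasses implement factorize() and reset(), and are passed
# to groupby via keyword arguments or a mapping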
def test_custom_grouper() -> None:
    class YearGrouper(Grouper):
        """
        An example re-implementation of ``.groupby("time.year")``.
        """

        def factorize(self, group) -> EncodedGroups:
            assert np.issubdtype(group.dtype, np.datetime64)
            year = group.dt.year.data
            codes_, uniques = pd.factorize(year)
            codes = group.copy(data=codes_).rename("year")
            return EncodedGroups(codes=codes, full_index=pd.Index(uniques))

        def reset(self):
            return type(self)()

    da = xr.DataArray(
        dims="time",
        data=np.arange(20),
        coords={"time": ("time", pd.date_range("2000-01-01", freq="3MS", periods=20))},
        name="foo",
    )
    ds = da.to_dataset()

    expected = ds.groupby("time.year").mean()
    actual = ds.groupby(time=YearGrouper()).mean()
    assert_identical(expected, actual)

    actual = ds.groupby({"time": YearGrouper()}).mean()
    assert_identical(expected, actual)

    expected = ds.foo.groupby("time.year").mean()
    actual = ds.foo.groupby(time=YearGrouper()).mean()
    assert_identical(expected, actual)

    actual = ds.foo.groupby({"time": YearGrouper()}).mean()
    assert_identical(expected, actual)

    for obj in [ds, ds.foo]:
        with pytest.raises(ValueError):
            obj.groupby("time.year", time=YearGrouper())
        with pytest.raises(ValueError):
            obj.groupby()


@pytest.mark.parametrize("use_flox", [True, False])
def test_weather_data_resample(use_flox):
    # from the docs
    times = pd.date_range("2000-01-01", "2001-12-31", name="time")
    annual_cycle = np.sin(2 * np.pi * (times.dayofyear.values / 365.25 - 0.28))

    base = 10 + 15 * annual_cycle.reshape(-1, 1)
    tmin_values = base + 3 * np.random.randn(annual_cycle.size, 3)
    tmax_values = base + 10 + 3 * np.random.randn(annual_cycle.size, 3)

    ds = xr.Dataset(
        {
            "tmin": (("time", "location"), tmin_values),
            "tmax": (("time", "location"), tmax_values),
        },
        {
            "time": ("time", times, {"time_key": "time_values"}),
            "location": ("location", ["IA", "IN", "IL"], {"loc_key": "loc_value"}),
        },
    )

    with xr.set_options(use_flox=use_flox):
        actual = ds.resample(time="1MS").mean()
        assert "location" in actual._indexes

    gb = ds.groupby(time=TimeResampler(freq="1MS"), location=UniqueGrouper())
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean()
    expected = ds.resample(time="1MS").mean().sortby("location")
    assert_allclose(actual, expected)
    assert actual.time.attrs == ds.time.attrs
    assert actual.location.attrs == ds.location.attrs

    assert expected.time.attrs == ds.time.attrs
    assert expected.location.attrs == ds.location.attrs


@pytest.mark.parametrize("as_dataset", [True, False])
def test_multiple_groupers_string(as_dataset) -> None:
    obj = DataArray(
        np.array([1, 2, 3, 0, 2, np.nan]),
        dims="d",
        coords=dict(
            labels1=("d", np.array(["a", "b", "c", "c", "b", "a"])),
            labels2=("d", np.array(["x", "y", "z", "z", "y", "x"])),
        ),
        name="foo",
    )

    if as_dataset:
        obj = obj.to_dataset()  # type: ignore[assignment]

    expected = obj.groupby(labels1=UniqueGrouper(), labels2=UniqueGrouper()).mean()
    actual = obj.groupby(("labels1", "labels2")).mean()
    assert_identical(expected, actual)

    # Passes "labels2" as the squeeze argument; in the future this will raise
    # an error about kwargs instead of the warning & TypeError below
    with pytest.warns(FutureWarning):
        with pytest.raises(TypeError):
            obj.groupby("labels1", "labels2")  # type: ignore[arg-type, misc]
    with pytest.raises(ValueError):
        obj.groupby("labels1", foo="bar")  # type: ignore[arg-type]
    with pytest.raises(ValueError):
        obj.groupby("labels1", foo=UniqueGrouper())


@pytest.mark.parametrize("shuffle", [True, False])
@pytest.mark.parametrize("use_flox", [True, False])
def test_multiple_groupers(use_flox: bool, shuffle: bool) -> None:
    da = DataArray(
        np.array([1, 2, 3, 0, 2, np.nan]),
        dims="d",
        coords=dict(
            labels1=("d", np.array(["a", "b", "c", "c", "b", "a"])),
            labels2=("d", np.array(["x", "y", "z", "z", "y", "x"])),
        ),
        name="foo",
    )

    groupers: dict[str, Grouper]
    groupers = dict(labels1=UniqueGrouper(), labels2=UniqueGrouper())
    gb = da.groupby(groupers)
    if shuffle:
        gb = gb.shuffle_to_chunks().groupby(groupers)
    repr(gb)

    expected = DataArray(
        np.array([[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 1.5]]),
        dims=("labels1", "labels2"),
        coords={
            "labels1": np.array(["a", "b", "c"], dtype=object),
            "labels2": np.array(["x", "y", "z"], dtype=object),
        },
        name="foo",
    )
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean()
    assert_identical(actual, expected)

    # -------
    coords = {"a": ("x", [0, 0, 1, 1]), "b": ("y", [0, 0, 1, 1])}
    square = DataArray(np.arange(16).reshape(4, 4), coords=coords, dims=["x", "y"])
    groupers = dict(a=UniqueGrouper(), b=UniqueGrouper())
    gb = square.groupby(groupers)
    if shuffle:
        gb = gb.shuffle_to_chunks().groupby(groupers)
    repr(gb)
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean()
    expected = DataArray(
        np.array([[2.5, 4.5], [10.5, 12.5]]),
        dims=("a", "b"),
        coords={"a": [0, 1], "b": [0, 1]},
    )
    assert_identical(actual, expected)

    expected = square.astype(np.float64)
    expected["a"], expected["b"] = broadcast(square.a, square.b)
    with xr.set_options(use_flox=use_flox):
        assert_identical(
            square.groupby(x=UniqueGrouper(), y=UniqueGrouper()).mean(), expected
        )

    b = xr.DataArray(
        np.random.default_rng(0).random((2, 3, 4)),
        coords={"xy": (("x", "y"), [["a", "b", "c"], ["b", "c", "c"]], {"foo": "bar"})},
        dims=["x", "y", "z"],
    )
    groupers = dict(x=UniqueGrouper(), y=UniqueGrouper())
    gb = b.groupby(groupers)
    if shuffle:
        gb = gb.shuffle_to_chunks().groupby(groupers)
    repr(gb)
    with xr.set_options(use_flox=use_flox):
        assert_identical(gb.mean("z"), b.mean("z"))

    groupers = dict(x=UniqueGrouper(), xy=UniqueGrouper())
    gb = b.groupby(groupers)
    if shuffle:
        gb = gb.shuffle_to_chunks().groupby(groupers)
    repr(gb)
    with xr.set_options(use_flox=use_flox):
        actual = gb.mean()
    expected = b.drop_vars("xy").rename({"y": "xy"}).copy(deep=True)
    newval = b.isel(x=1, y=slice(1, None)).mean("y").data
    expected.loc[dict(x=1, xy=1)] = expected.sel(x=1, xy=0).data
    expected.loc[dict(x=1, xy=0)] = np.nan
    expected.loc[dict(x=1, xy=2)] = newval
    expected["xy"] = ("xy", ["a", "b", "c"], {"foo": "bar"})
    # TODO: is order of dims correct?
    assert_identical(actual, expected.transpose("z", "x", "xy"))

    if has_dask:
        b["xy"] = b["xy"].chunk()
        for eagerly_compute_group in [True, False]:
            kwargs = dict(
                x=UniqueGrouper(),
                xy=UniqueGrouper(labels=["a", "b", "c"]),
                eagerly_compute_group=eagerly_compute_group,
            )
            expected = xr.DataArray(
                [[[1, 1, 1], [np.nan, 1, 2]]] * 4,
                dims=("z", "x", "xy"),
                coords={"xy": ("xy", ["a", "b", "c"], {"foo": "bar"})},
            )
            if eagerly_compute_group:
                with raise_if_dask_computes(max_computes=1):
                    with pytest.warns(DeprecationWarning):
                        gb = b.groupby(**kwargs)  # type: ignore[arg-type]
                    assert_identical(gb.count(), expected)
            else:
                with raise_if_dask_computes(max_computes=0):
                    gb = b.groupby(**kwargs)  # type: ignore[arg-type]
                assert is_chunked_array(gb.encoded.codes.data)
                assert not gb.encoded.group_indices
                if has_flox:
                    with raise_if_dask_computes(max_computes=1):
                        assert_identical(gb.count(), expected)
                else:
                    with pytest.raises(ValueError, match="when lazily grouping"):
                        gb.count()


@pytest.mark.parametrize("use_flox", [True, False])
@pytest.mark.parametrize("shuffle", [True, False])
def test_multiple_groupers_mixed(use_flox: bool, shuffle: bool) -> None:
    # This groupby has missing groups
    ds = xr.Dataset(
        {"foo": (("x", "y"), np.arange(12).reshape((4, 3)))},
        coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))},
    )
    groupers: dict[str, Grouper] = dict(
        x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()
    )
    gb = ds.groupby(groupers)
    if shuffle:
        gb = gb.shuffle_to_chunks().groupby(groupers)
    expected_data = np.array(
        [
            [[0.0, np.nan], [np.nan, 3.0]],
            [[1.0, np.nan], [np.nan, 4.0]],
            [[2.0, np.nan], [np.nan, 5.0]],
        ]
    )
    expected = xr.Dataset(
        {"foo": (("y", "x_bins", "letters"), expected_data)},
        coords={
            "x_bins": (
                "x_bins",
                np.array(
                    [
                        pd.Interval(5, 15, closed="right"),
                        pd.Interval(15, 25, closed="right"),
                    ],
                    dtype=object,
                ),
            ),
            "letters": ("letters", np.array(["a", "b"], dtype=object)),
        },
    )
    with xr.set_options(use_flox=use_flox):
        actual = gb.sum()
    assert_identical(actual, expected)

    # assert_identical(
    #     b.groupby(['x', 'y']).apply(lambda x: x - x.mean()),
    #     b - b.mean("z"),
    # )

    # gb = square.groupby(x=UniqueGrouper(), y=UniqueGrouper())
    # gb - gb.mean()

    # ------


@requires_flox_0_9_12
@pytest.mark.parametrize(
    "reduction", ["max", "min", "nanmax", "nanmin", "sum", "nansum", "prod", "nanprod"]
)
def test_groupby_preserve_dtype(reduction):
    # all groups are present, we should follow numpy exactly
    ds = xr.Dataset(
        {
            "test": (
                ["x", "y"],
                np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype="int16"),
            )
        },
        coords={"idx": ("x", [1, 2, 1])},
    )

    kwargs = {}
    if "nan" in reduction:
        kwargs["skipna"] = True
    # TODO: fix dtype with numbagg/bottleneck and use_flox=False
    with xr.set_options(use_numbagg=False, use_bottleneck=False):
        actual = getattr(ds.groupby("idx"), reduction.removeprefix("nan"))(
            **kwargs
        ).test.dtype
    expected = getattr(np, reduction)(ds.test.data, axis=0).dtype

    assert actual == expected


@requires_dask
@requires_flox_0_9_12
@pytest.mark.parametrize("reduction", ["any", "all", "count"])
def test_gappy_resample_reductions(reduction):
    # GH8090
    dates = (("1988-12-01", "1990-11-30"), ("2000-12-01", "2001-11-30"))
    times = [xr.date_range(*d, freq="D") for d in dates]

    da = xr.concat(
        [
            xr.DataArray(np.random.rand(len(t)), coords={"time": t}, dims="time")
            for t in times
        ],
        dim="time",
    ).chunk(time=100)

    rs = (da > 0.5).resample(time="YS-DEC")
    method = getattr(rs, reduction)
    with xr.set_options(use_flox=True):
        actual = method(dim="time")
    with xr.set_options(use_flox=False):
        expected = method(dim="time")
    assert_identical(expected, actual)


def test_groupby_transpose():
    # GH5361
    data = xr.DataArray(
        np.random.randn(4, 2),
        dims=["x", "z"],
        coords={"x": ["a", "b", "a", "c"], "y": ("x", [0, 1, 0, 2])},
    )
    first = data.T.groupby("x").sum()
    second = data.groupby("x").sum()

    assert_identical(first, second.transpose(*first.dims))


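# when group labels or bin edges are known up front, factorize() can stay
# lazy: it must not compute the dask-backed group variable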
@requires_dask
@pytest.mark.parametrize(
    "grouper, expect_index",
    [
        [UniqueGrouper(labels=np.arange(1, 5)), pd.Index(np.arange(1, 5))],
        [UniqueGrouper(labels=np.arange(1, 5)[::-1]), pd.Index(np.arange(1, 5)[::-1])],
        [
            BinGrouper(bins=np.arange(1, 5)),
            pd.IntervalIndex.from_breaks(np.arange(1, 5)),
        ],
    ],
)
def test_lazy_grouping(grouper, expect_index):
    import dask.array

    data = DataArray(
        dims=("x", "y"),
        data=dask.array.arange(20, chunks=3).reshape((4, 5)),
        name="zoo",
    )
    with raise_if_dask_computes():
        encoded = grouper.factorize(data)
    assert encoded.codes.ndim == data.ndim
    pd.testing.assert_index_equal(encoded.full_index, expect_index)
    np.testing.assert_array_equal(encoded.unique_coord.values, np.array(expect_index))

    eager = (
        xr.Dataset({"foo": data}, coords={"zoo": data.compute()})
        .groupby(zoo=grouper)
        .count()
    )
    expected = Dataset(
        {"foo": (encoded.codes.name, np.ones(encoded.full_index.size))},
        coords={encoded.codes.name: expect_index},
    )
    assert_identical(eager, expected)

    if has_flox:
        lazy = (
            xr.Dataset({"foo": data}, coords={"zoo": data})
            .groupby(zoo=grouper, eagerly_compute_group=False)
            .count()
        )
        assert_identical(eager, lazy)


@requires_dask
def test_lazy_grouping_errors():
    import dask.array

    data = DataArray(
        dims=("x",),
        data=dask.array.arange(20, chunks=3),
        name="foo",
        coords={"y": ("x", dask.array.arange(20, chunks=3))},
    )

    gb = data.groupby(
        y=UniqueGrouper(labels=np.arange(5, 10)), eagerly_compute_group=False
    )
    message = "not supported when lazily grouping by"
    with pytest.raises(ValueError, match=message):
        gb.map(lambda x: x)

    with pytest.raises(ValueError, match=message):
        gb.reduce(np.mean)

    with pytest.raises(ValueError, match=message):
        for _, _ in gb:
            pass


@requires_dask
def test_lazy_int_bins_error():
    import dask.array

    with pytest.raises(ValueError, match="Bin edges must be provided"):
        with raise_if_dask_computes():
            _ = BinGrouper(bins=4).factorize(DataArray(dask.array.arange(3)))


def test_time_grouping_seasons_specified():
    time = xr.date_range("2001-01-01", "2002-01-01", freq="D")
    ds = xr.Dataset({"foo": np.arange(time.size)}, coords={"time": ("time", time)})
    labels = ["DJF", "MAM", "JJA", "SON"]
    actual = ds.groupby({"time.season": UniqueGrouper(labels=labels)}).sum()
    expected = ds.groupby("time.season").sum()
    assert_identical(actual, expected.reindex(season=labels))


def test_groupby_multiple_bin_grouper_missing_groups():
    from numpy import nan

    ds = xr.Dataset(
        {"foo": (("z"), np.arange(12))},
        coords={"x": ("z", np.arange(12)), "y": ("z", np.arange(12))},
    )

    actual = ds.groupby(
        x=BinGrouper(np.arange(0, 13, 4)), y=BinGrouper(bins=np.arange(0, 16, 2))
    ).count()
    expected = Dataset(
        {
            "foo": (
                ("x_bins", "y_bins"),
                np.array(
                    [
                        [2.0, 2.0, nan, nan, nan, nan, nan],
                        [nan, nan, 2.0, 2.0, nan, nan, nan],
                        [nan, nan, nan, nan, 2.0, 1.0, nan],
                    ]
                ),
            )
        },
        coords={
            "x_bins": ("x_bins", pd.IntervalIndex.from_breaks(np.arange(0, 13, 4))),
            "y_bins": ("y_bins", pd.IntervalIndex.from_breaks(np.arange(0, 16, 2))),
        },
    )
    assert_identical(actual, expected)


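# shuffle_to_chunks reorders the underlying data so the members of each group
# are contiguous, e.g. [a, a, b, b, c, c] below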
@requires_dask_ge_2024_08_1
def test_shuffle_simple() -> None:
    import dask

    da = xr.DataArray(
        dims="x",
        data=dask.array.from_array([1, 2, 3, 4, 5, 6], chunks=2),
        coords={"label": ("x", "a b c a b c".split(" "))},
    )
    actual = da.groupby(label=UniqueGrouper()).shuffle_to_chunks()
    expected = da.isel(x=[0, 3, 1, 4, 2, 5])
    assert_identical(actual, expected)

    with pytest.raises(ValueError):
        da.chunk(x=2, eagerly_load_group=False).groupby("label").shuffle_to_chunks()


@requires_dask_ge_2024_08_1
@pytest.mark.parametrize(
    "chunks, expected_chunks",
    [
        ((1,), (1, 3, 3, 3)),
        ((10,), (10,)),
    ],
)
def test_shuffle_by(chunks, expected_chunks):
    import dask.array

    from xarray.groupers import UniqueGrouper

    da = xr.DataArray(
        dims="x",
        data=dask.array.arange(10, chunks=chunks),
        coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]},
        name="a",
    )
    ds = da.to_dataset()

    for obj in [ds, da]:
        actual = obj.groupby(x=UniqueGrouper()).shuffle_to_chunks()
        assert_identical(actual, obj.sortby("x"))
        assert actual.chunksizes["x"] == expected_chunks


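# grouping by a chunked variable eagerly loads it, so warn until users opt in
# via eagerly_compute_group=False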
@requires_dask
def test_groupby_dask_eager_load_warnings():
    ds = xr.Dataset(
        {"foo": (("z"), np.arange(12))},
        coords={"x": ("z", np.arange(12)), "y": ("z", np.arange(12))},
    ).chunk(z=6)

    with pytest.warns(DeprecationWarning):
        ds.groupby(x=UniqueGrouper())

    with pytest.warns(DeprecationWarning):
        ds.groupby("x")

    with pytest.warns(DeprecationWarning):
        ds.groupby(ds.x)

    with pytest.raises(ValueError, match="Please pass"):
        ds.groupby("x", eagerly_compute_group=False)

    # This is technically fine but anyone iterating over the groupby object
    # will see an error, so let's warn and have them opt-in.
    with pytest.warns(DeprecationWarning):
        ds.groupby(x=UniqueGrouper(labels=[1, 2, 3]))

    ds.groupby(x=UniqueGrouper(labels=[1, 2, 3]), eagerly_compute_group=False)

    with pytest.warns(DeprecationWarning):
        ds.groupby_bins("x", bins=3)
    with pytest.raises(ValueError, match="Please pass"):
        ds.groupby_bins("x", bins=3, eagerly_compute_group=False)
    with pytest.warns(DeprecationWarning):
        ds.groupby_bins("x", bins=[1, 2, 3])
    ds.groupby_bins("x", bins=[1, 2, 3], eagerly_compute_group=False)


# TODO: Possible property tests to add to this module
# 1. lambda x: x
# 2. grouped-reduce on unique coords is identical to array
# 3. group_over == groupby-reduce along other dimensions
# 4. result is equivalent for transposed input