from __future__ import annotations

import operator
import pickle
import sys
from contextlib import suppress
from textwrap import dedent

import numpy as np
import pandas as pd
import pytest

import xarray as xr
import xarray.ufuncs as xu
from xarray import DataArray, Dataset, Variable
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import lazy_array_equiv
from xarray.core.indexes import PandasIndex
from xarray.testing import assert_chunks_equal
from xarray.tests import (
    assert_allclose,
    assert_array_equal,
    assert_equal,
    assert_frame_equal,
    assert_identical,
    mock,
    raise_if_dask_computes,
    requires_pint,
    requires_scipy_or_netCDF4,
)
from xarray.tests.test_backends import create_tmp_file

dask = pytest.importorskip("dask")
da = pytest.importorskip("dask.array")
dd = pytest.importorskip("dask.dataframe")

ON_WINDOWS = sys.platform == "win32"


def test_raise_if_dask_computes():
    data = da.from_array(np.random.default_rng(0).random((4, 6)), chunks=(2, 2))
    with pytest.raises(RuntimeError, match=r"Too many computes"):
        with raise_if_dask_computes():
            data.compute()
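

# A minimal companion sketch (not in the original suite): the same helper
# accepts a ``max_computes`` budget, as used further below (e.g. in
# test_lazy_array_equiv_merge and test_new_index_var_computes_once), so a
# single compute inside the context passes without raising.
def test_raise_if_dask_computes_max_computes():
    data = da.from_array(np.random.default_rng(0).random((4, 6)), chunks=(2, 2))
    with raise_if_dask_computes(max_computes=1):
        data.compute()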


class DaskTestCase:
    def assertLazyAnd(self, expected, actual, test):
        """Run ``test(actual, expected)`` under the synchronous scheduler, then
        verify that ``actual`` is still dask-backed, apart from index
        variables, which are loaded into numpy.
        """
        with dask.config.set(scheduler="synchronous"):
            test(actual, expected)

        if isinstance(actual, Dataset):
            for k, v in actual.variables.items():
                if k in actual.xindexes:
                    assert isinstance(v.data, np.ndarray)
                else:
                    assert isinstance(v.data, da.Array)
        elif isinstance(actual, DataArray):
            assert isinstance(actual.data, da.Array)
            for k, v in actual.coords.items():
                if k in actual.xindexes:
                    assert isinstance(v.data, np.ndarray)
                else:
                    assert isinstance(v.data, da.Array)
        elif isinstance(actual, Variable):
            assert isinstance(actual.data, da.Array)
        else:
            raise AssertionError()


class TestVariable(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_allclose)

    @pytest.fixture(autouse=True)
    def setUp(self):
        self.values = np.random.default_rng(0).random((4, 6))
        self.data = da.from_array(self.values, chunks=(2, 2))

        self.eager_var = Variable(("x", "y"), self.values)
        self.lazy_var = Variable(("x", "y"), self.data)

    def test_basics(self):
        v = self.lazy_var
        assert self.data is v.data
        assert self.data.chunks == v.chunks
        assert_array_equal(self.values, v)

    def test_copy(self):
        self.assertLazyAndIdentical(self.eager_var, self.lazy_var.copy())
        self.assertLazyAndIdentical(self.eager_var, self.lazy_var.copy(deep=True))

    def test_chunk(self):
        for chunks, expected in [
            ({}, ((2, 2), (2, 2, 2))),
            (3, ((3, 1), (3, 3))),
            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
            ({"x": 3}, ((3, 1), (2, 2, 2))),
            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
        ]:
            rechunked = self.lazy_var.chunk(chunks)
            assert rechunked.chunks == expected
            self.assertLazyAndIdentical(self.eager_var, rechunked)

            expected_chunksizes = dict(zip(self.lazy_var.dims, expected, strict=True))
            assert rechunked.chunksizes == expected_chunksizes

    def test_indexing(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u[0], v[0])
        self.assertLazyAndIdentical(u[:1], v[:1])
        self.assertLazyAndIdentical(u[[0, 1], [0, 1, 2]], v[[0, 1], [0, 1, 2]])

    @pytest.mark.parametrize(
        "expected_data, index",
        [
            (da.array([99, 2, 3, 4]), 0),
            (da.array([99, 99, 99, 4]), slice(2, None, -1)),
            (da.array([99, 99, 3, 99]), [0, -1, 1]),
            (da.array([99, 99, 99, 4]), np.arange(3)),
            (da.array([1, 99, 99, 99]), [False, True, True, True]),
            (da.array([1, 99, 99, 99]), np.array([False, True, True, True])),
            (da.array([99, 99, 99, 99]), Variable(("x"), np.array([True] * 4))),
        ],
    )
    def test_setitem_dask_array(self, expected_data, index):
        arr = Variable(("x"), da.array([1, 2, 3, 4]))
        expected = Variable(("x"), expected_data)
        with raise_if_dask_computes():
            arr[index] = 99
        assert_identical(arr, expected)

    def test_squeeze(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u[0].squeeze(), v[0].squeeze())

    def test_equals(self):
        v = self.lazy_var
        assert v.equals(v)
        assert isinstance(v.data, da.Array)
        assert v.identical(v)
        assert isinstance(v.data, da.Array)

    def test_transpose(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u.T, v.T)

    def test_shift(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u.shift(x=2), v.shift(x=2))
        self.assertLazyAndIdentical(u.shift(x=-2), v.shift(x=-2))
        assert v.data.chunks == v.shift(x=1).data.chunks

    def test_roll(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u.roll(x=2), v.roll(x=2))
        assert v.data.chunks == v.roll(x=1).data.chunks

    def test_unary_op(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(-u, -v)
        self.assertLazyAndIdentical(abs(u), abs(v))
        self.assertLazyAndIdentical(u.round(), v.round())

    def test_binary_op(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(2 * u, 2 * v)
        self.assertLazyAndIdentical(u + u, v + v)
        self.assertLazyAndIdentical(u[0] + u, v[0] + v)

    def test_binary_op_bitshift(self) -> None:
        # bit shifts only work on ints so we need to generate
        # new eager and lazy vars
        rng = np.random.default_rng(0)
        values = rng.integers(low=-10000, high=10000, size=(4, 6))
        data = da.from_array(values, chunks=(2, 2))
        u = Variable(("x", "y"), values)
        v = Variable(("x", "y"), data)
        self.assertLazyAndIdentical(u << 2, v << 2)
        self.assertLazyAndIdentical(u << 5, v << 5)
        self.assertLazyAndIdentical(u >> 2, v >> 2)
        self.assertLazyAndIdentical(u >> 5, v >> 5)

    def test_repr(self):
        expected = dedent(
            f"""\
            <xarray.Variable (x: 4, y: 6)> Size: 192B
            {self.lazy_var.data!r}"""
        )
        assert expected == repr(self.lazy_var)

    def test_pickle(self):
        # Test that pickling/unpickling does not convert the dask
        # backend to numpy
        a1 = Variable(["x"], build_dask_array("x"))
        a1.compute()
        assert not a1._in_memory
        assert kernel_call_count == 1
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 1
        assert_identical(a1, a2)
        assert not a1._in_memory
        assert not a2._in_memory

    def test_reduce(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(u.std(), v.std())
        with raise_if_dask_computes():
            actual = v.argmax(dim="x")
        self.assertLazyAndAllClose(u.argmax(dim="x"), actual)
        with raise_if_dask_computes():
            actual = v.argmin(dim="x")
        self.assertLazyAndAllClose(u.argmin(dim="x"), actual)
        self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
        self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x"))
        with pytest.raises(NotImplementedError, match=r"only works along an axis"):
            v.median()
        with pytest.raises(NotImplementedError, match=r"only works along an axis"):
            v.median(v.dims)
        with raise_if_dask_computes():
            v.reduce(duck_array_ops.mean)

    def test_missing_values(self):
        values = np.array([0, 1, np.nan, 3])
        data = da.from_array(values, chunks=(2,))

        eager_var = Variable("x", values)
        lazy_var = Variable("x", data)
        self.assertLazyAndIdentical(eager_var, lazy_var.fillna(lazy_var))
        self.assertLazyAndIdentical(Variable("x", range(4)), lazy_var.fillna(2))
        self.assertLazyAndIdentical(eager_var.count(), lazy_var.count())

    def test_concat(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndIdentical(u, Variable.concat([v[:2], v[2:]], "x"))
        self.assertLazyAndIdentical(u[:2], Variable.concat([v[0], v[1]], "x"))
        self.assertLazyAndIdentical(u[:2], Variable.concat([u[0], v[1]], "x"))
        self.assertLazyAndIdentical(u[:2], Variable.concat([v[0], u[1]], "x"))
        self.assertLazyAndIdentical(
            u[:3], Variable.concat([v[[0, 2]], v[[1]]], "x", positions=[[0, 2], [1]])
        )

    def test_missing_methods(self):
        v = self.lazy_var
        try:
            v.argsort()
        except NotImplementedError as err:
            assert "dask" in str(err)
        try:
            v[0].item()
        except NotImplementedError as err:
            assert "dask" in str(err)

    def test_univariate_ufunc(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndAllClose(np.sin(u), np.sin(v))

    def test_bivariate_ufunc(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndAllClose(np.maximum(u, 0), np.maximum(v, 0))
        self.assertLazyAndAllClose(np.maximum(u, 0), np.maximum(0, v))

    def test_univariate_xufunc(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

    def test_bivariate_xufunc(self):
        u = self.eager_var
        v = self.lazy_var
        self.assertLazyAndAllClose(np.maximum(u, 0), xu.maximum(v, 0))
        self.assertLazyAndAllClose(np.maximum(u, 0), xu.maximum(0, v))

    def test_compute(self):
        u = self.eager_var
        v = self.lazy_var

        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    def test_persist(self):
        u = self.eager_var
        v = self.lazy_var + 1

        (v2,) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    @requires_pint
    def test_tokenize_duck_dask_array(self):
        import pint

        unit_registry = pint.UnitRegistry()

        q = unit_registry.Quantity(self.data, "meter")
        variable = xr.Variable(("x", "y"), q)

        token = dask.base.tokenize(variable)
        post_op = variable + 5 * unit_registry.meter

        assert dask.base.tokenize(variable) != dask.base.tokenize(post_op)
        # Immutability check
        assert dask.base.tokenize(variable) == token


class TestDataArrayAndDataset(DaskTestCase):
    def assertLazyAndIdentical(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_identical)

    def assertLazyAndAllClose(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_allclose)

    def assertLazyAndEqual(self, expected, actual):
        self.assertLazyAnd(expected, actual, assert_equal)

    @pytest.fixture(autouse=True)
    def setUp(self):
        self.values = np.random.randn(4, 6)
        self.data = da.from_array(self.values, chunks=(2, 2))
        self.eager_array = DataArray(
            self.values, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )
        self.lazy_array = DataArray(
            self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )

    def test_chunk(self) -> None:
        for chunks, expected in [
            ({}, ((2, 2), (2, 2, 2))),
            (3, ((3, 1), (3, 3))),
            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
            ({"x": 3}, ((3, 1), (2, 2, 2))),
            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
            ({"x": "16B"}, ((1, 1, 1, 1), (2, 2, 2))),
            ("16B", ((1, 1, 1, 1), (1,) * 6)),
            ("16MB", ((4,), (6,))),
        ]:
            # Test DataArray
            rechunked = self.lazy_array.chunk(chunks)
            assert rechunked.chunks == expected
            self.assertLazyAndIdentical(self.eager_array, rechunked)

            expected_chunksizes = dict(zip(self.lazy_array.dims, expected, strict=True))
            assert rechunked.chunksizes == expected_chunksizes

            # Test Dataset
            lazy_dataset = self.lazy_array.to_dataset()
            eager_dataset = self.eager_array.to_dataset()
            expected_chunksizes = dict(zip(lazy_dataset.dims, expected, strict=True))
            rechunked = lazy_dataset.chunk(chunks)

            # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843
            assert rechunked.chunks == expected_chunksizes
            self.assertLazyAndIdentical(eager_dataset, rechunked)

            assert rechunked.chunksizes == expected_chunksizes

    def test_rechunk(self):
        chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
        assert chunked.chunks == ((2,) * 2, (2,) * 3)
        self.assertLazyAndIdentical(self.lazy_array, chunked)

    def test_new_chunk(self):
        chunked = self.eager_array.chunk()
        assert chunked.data.name.startswith("xarray-<this-array>")

    def test_lazy_dataset(self):
        lazy_ds = Dataset({"foo": (("x", "y"), self.data)})
        assert isinstance(lazy_ds.foo.variable.data, da.Array)

    def test_lazy_array(self):
        u = self.eager_array
        v = self.lazy_array

        self.assertLazyAndAllClose(u, v)
        self.assertLazyAndAllClose(-u, -v)
        self.assertLazyAndAllClose(u.T, v.T)
        self.assertLazyAndAllClose(u.mean(), v.mean())
        self.assertLazyAndAllClose(1 + u, 1 + v)

        actual = xr.concat([v[:2], v[2:]], "x")
        self.assertLazyAndAllClose(u, actual)

    def test_compute(self):
        u = self.eager_array
        v = self.lazy_array

        assert dask.is_dask_collection(v)
        (v2,) = dask.compute(v + 1)
        assert not dask.is_dask_collection(v2)

        assert ((u + 1).data == v2.data).all()

    def test_persist(self):
        u = self.eager_array
        v = self.lazy_array + 1

        (v2,) = dask.persist(v)
        assert v is not v2
        assert len(v2.__dask_graph__()) < len(v.__dask_graph__())
        assert v2.__dask_keys__() == v.__dask_keys__()
        assert dask.is_dask_collection(v)
        assert dask.is_dask_collection(v2)

        self.assertLazyAndAllClose(u + 1, v)
        self.assertLazyAndAllClose(u + 1, v2)

    def test_concat_loads_variables(self):
        # Test that concat() computes not-in-memory variables at most once
        # and loads them in the output, while leaving the input unaltered.
        d1 = build_dask_array("d1")
        c1 = build_dask_array("c1")
        d2 = build_dask_array("d2")
        c2 = build_dask_array("c2")
        d3 = build_dask_array("d3")
        c3 = build_dask_array("c3")
        # Note: c is a non-index coord.
        # Index coords are loaded by IndexVariable.__init__.
        ds1 = Dataset(data_vars={"d": ("x", d1)}, coords={"c": ("x", c1)})
        ds2 = Dataset(data_vars={"d": ("x", d2)}, coords={"c": ("x", c2)})
        ds3 = Dataset(data_vars={"d": ("x", d3)}, coords={"c": ("x", c3)})

        assert kernel_call_count == 0
        out = xr.concat(
            [ds1, ds2, ds3], dim="n", data_vars="different", coords="different"
        )
        # each kernel is computed exactly once
        assert kernel_call_count == 6
        # variables are loaded in the output
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars="all", coords="all")
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=["d"], coords=["c"])
        # no extra kernel calls
        assert kernel_call_count == 6
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat([ds1, ds2, ds3], dim="n", data_vars=[], coords=[])
        # variables are loaded once as we are validating that they're identical
        assert kernel_call_count == 12
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        out = xr.concat(
            [ds1, ds2, ds3],
            dim="n",
            data_vars="different",
            coords="different",
            compat="identical",
        )
        # compat=identical doesn't do any more kernel calls than compat=equals
        assert kernel_call_count == 18
        assert isinstance(out["d"].data, np.ndarray)
        assert isinstance(out["c"].data, np.ndarray)

        # When the test for different turns true halfway through,
        # stop computing variables as it would not have any benefit
        ds4 = Dataset(data_vars={"d": ("x", [2.0])}, coords={"c": ("x", [2.0])})
        out = xr.concat(
            [ds1, ds2, ds4, ds3], dim="n", data_vars="different", coords="different"
        )
        # the variables of ds1 and ds2 were computed, but those of ds3 were not
        assert kernel_call_count == 22
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)
        # the data of ds1 and ds2 was loaded into numpy and then
        # concatenated to the data of ds3. Thus, only ds3 is computed now.
        out.compute()
        assert kernel_call_count == 24

        # Finally, test that the originals are unaltered
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

        # now check that concat() is correctly using dask name equality to skip loads
        out = xr.concat(
            [ds1, ds1, ds1], dim="n", data_vars="different", coords="different"
        )
        assert kernel_call_count == 24
        # variables are not loaded in the output
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat(
            [ds1, ds1, ds1], dim="n", data_vars=[], coords=[], compat="identical"
        )
        assert kernel_call_count == 24
        # variables are not loaded in the output
        assert isinstance(out["d"].data, dask.array.Array)
        assert isinstance(out["c"].data, dask.array.Array)

        out = xr.concat(
            [ds1, ds2.compute(), ds3],
            dim="n",
            data_vars="all",
            coords="different",
            compat="identical",
        )
        # c1,c3 must be computed for comparison since c2 is numpy;
        # d2 is computed too
        assert kernel_call_count == 28

        out = xr.concat(
            [ds1, ds2.compute(), ds3],
            dim="n",
            data_vars="all",
            coords="all",
            compat="identical",
        )
        # no extra computes
        assert kernel_call_count == 30

        # Finally, test that the originals are unaltered
        assert ds1["d"].data is d1
        assert ds1["c"].data is c1
        assert ds2["d"].data is d2
        assert ds2["c"].data is c2
        assert ds3["d"].data is d3
        assert ds3["c"].data is c3

    def test_groupby(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.groupby("x").mean(...)
        with raise_if_dask_computes():
            actual = v.groupby("x").mean(...)
        self.assertLazyAndAllClose(expected, actual)

    def test_rolling(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.rolling(x=2).mean()
        with raise_if_dask_computes():
            actual = v.rolling(x=2).mean()
        self.assertLazyAndAllClose(expected, actual)

    @pytest.mark.parametrize("func", ["first", "last"])
    def test_groupby_first_last(self, func):
        method = operator.methodcaller(func)
        u = self.eager_array
        v = self.lazy_array

        for coords in [u.coords, v.coords]:
            coords["ab"] = ("x", ["a", "a", "b", "b"])
        expected = method(u.groupby("ab"))

        with raise_if_dask_computes():
            actual = method(v.groupby("ab"))
        self.assertLazyAndAllClose(expected, actual)

        with raise_if_dask_computes():
            actual = method(v.groupby("ab"))
        self.assertLazyAndAllClose(expected, actual)

    def test_reindex(self):
        u = self.eager_array.assign_coords(y=range(6))
        v = self.lazy_array.assign_coords(y=range(6))

        for kwargs in [
            {"x": [2, 3, 4]},
            {"x": [1, 100, 2, 101, 3]},
            {"x": [2.5, 3, 3.5], "y": [2, 2.5, 3]},
        ]:
            expected = u.reindex(**kwargs)
            actual = v.reindex(**kwargs)
            self.assertLazyAndAllClose(expected, actual)

    def test_to_dataset_roundtrip(self):
        u = self.eager_array
        v = self.lazy_array

        expected = u.assign_coords(x=u["x"])
        self.assertLazyAndEqual(expected, v.to_dataset("x").to_dataarray("x"))

    def test_merge(self):
        def duplicate_and_merge(array):
            return xr.merge([array, array.rename("bar")]).to_dataarray()

        expected = duplicate_and_merge(self.eager_array)
        actual = duplicate_and_merge(self.lazy_array)
        self.assertLazyAndEqual(expected, actual)

    def test_ufuncs(self):
        u = self.eager_array
        v = self.lazy_array
        self.assertLazyAndAllClose(np.sin(u), np.sin(v))

    def test_where_dispatching(self):
        a = np.arange(10)
        b = a > 3
        x = da.from_array(a, 5)
        y = da.from_array(b, 5)
        expected = DataArray(a).where(b)
        self.assertLazyAndEqual(expected, DataArray(a).where(y))
        self.assertLazyAndEqual(expected, DataArray(x).where(b))
        self.assertLazyAndEqual(expected, DataArray(x).where(y))

    def test_simultaneous_compute(self):
        ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        ds.load(scheduler=counting_get)

        assert count[0] == 1

    def test_duplicate_dims(self):
        data = np.random.normal(size=(4, 4))
        with pytest.warns(UserWarning, match="Duplicate dimension"):
            arr = DataArray(data, dims=("x", "x"))
        with pytest.warns(UserWarning, match="Duplicate dimension"):
            chunked_array = arr.chunk({"x": 2})
        assert chunked_array.chunks == ((2, 2), (2, 2))
        assert chunked_array.chunksizes == {"x": (2, 2)}

    def test_stack(self):
        data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4))
        arr = DataArray(data, dims=("w", "x", "y"))
        stacked = arr.stack(z=("x", "y"))
        z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], names=["x", "y"])
        expected = DataArray(data.reshape(2, -1), {"z": z}, dims=["w", "z"])
        assert stacked.data.chunks == expected.data.chunks
        self.assertLazyAndEqual(expected, stacked)

    def test_dot(self):
        eager = self.eager_array.dot(self.eager_array[0])
        lazy = self.lazy_array.dot(self.lazy_array[0])
        self.assertLazyAndAllClose(eager, lazy)

    def test_dataarray_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        expected = dedent(
            f"""\
            <xarray.DataArray 'data' (x: 1)> Size: 8B
            {data!r}
            Coordinates:
                y        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x"""
        )
        assert expected == repr(a)
        assert kernel_call_count == 0  # should not evaluate dask array

    def test_dataset_repr(self):
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)})
        expected = dedent(
            """\
            <xarray.Dataset> Size: 16B
            Dimensions:  (x: 1)
            Coordinates:
                y        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>
            Dimensions without coordinates: x
            Data variables:
                a        (x) int64 8B dask.array<chunksize=(1,), meta=np.ndarray>"""
        )
        assert expected == repr(ds)
        assert kernel_call_count == 0  # should not evaluate dask array

    def test_dataarray_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy in either the data variable or the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a1 = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        a1.compute()
        assert not a1._in_memory
        assert not a1.coords["y"]._in_memory
        assert kernel_call_count == 2
        a2 = pickle.loads(pickle.dumps(a1))
        assert kernel_call_count == 2
        assert_identical(a1, a2)
        assert not a1._in_memory
        assert not a2._in_memory
        assert not a1.coords["y"]._in_memory
        assert not a2.coords["y"]._in_memory

    def test_dataset_pickle(self):
        # Test that pickling/unpickling does not convert the dask backend
        # to numpy in either the data variables or the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds1 = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)})
        ds1.compute()
        assert not ds1["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert kernel_call_count == 2
        ds2 = pickle.loads(pickle.dumps(ds1))
        assert kernel_call_count == 2
        assert_identical(ds1, ds2)
        assert not ds1["a"]._in_memory
        assert not ds2["a"]._in_memory
        assert not ds1["y"]._in_memory
        assert not ds2["y"]._in_memory

    def test_dataarray_getattr(self):
        # ipython/jupyter does a long list of getattr() calls when trying to
        # represent an object.
        # Make sure we're not accidentally computing dask variables.
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)})
        with suppress(AttributeError):
            _ = a.NOTEXIST
        assert kernel_call_count == 0

    def test_dataset_getattr(self):
        # As above, make sure a failing getattr() does not accidentally
        # compute the data variables or the non-index coords
        data = build_dask_array("data")
        nonindex_coord = build_dask_array("coord")
        ds = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)})
        with suppress(AttributeError):
            _ = ds.NOTEXIST
        assert kernel_call_count == 0

    def test_values(self):
        # Test that invoking the values property does not convert the dask
        # backend to numpy
        a = DataArray([1, 2]).chunk()
        assert not a._in_memory
        assert a.values.tolist() == [1, 2]
        assert not a._in_memory

    def test_from_dask_variable(self):
        # Test array creation from Variable with dask backend.
        # This is used e.g. in broadcast()
        a = DataArray(self.lazy_array.variable, coords={"x": range(4)}, name="foo")
        self.assertLazyAndIdentical(self.lazy_array, a)

    @requires_pint
    def test_tokenize_duck_dask_array(self):
        import pint

        unit_registry = pint.UnitRegistry()

        q = unit_registry.Quantity(self.data, unit_registry.meter)
        data_array = xr.DataArray(
            data=q, coords={"x": range(4)}, dims=("x", "y"), name="foo"
        )

        token = dask.base.tokenize(data_array)
        post_op = data_array + 5 * unit_registry.meter

        assert dask.base.tokenize(data_array) != dask.base.tokenize(post_op)
        # Immutability check
        assert dask.base.tokenize(data_array) == token


class TestToDaskDataFrame:
    @pytest.mark.xfail(reason="https://github.com/dask/dask/issues/11584")
    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = np.random.randn(10)
        y = np.arange(10, dtype="uint8")
        t = list("abcdefghij")

        ds = Dataset(
            {"a": ("t", da.from_array(x, chunks=4)), "b": ("t", y), "t": ("t", t)}
        )

        expected_pd = pd.DataFrame({"a": x, "b": y}, index=pd.Index(t, name="t"))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        assert isinstance(actual, dd.DataFrame)

        # use the .equals from pandas to check dataframes are equivalent
        assert_frame_equal(actual.compute(), expected.compute())

        # test if no index is given
        expected = dd.from_pandas(expected_pd.reset_index(drop=False), chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(actual.compute(), expected.compute())

    @pytest.mark.xfail(
        reason="Currently pandas with pyarrow installed will return a `string[pyarrow]` type, "
        "which causes the `y` column to have a different type depending on whether pyarrow is installed"
    )
    def test_to_dask_dataframe_2D(self):
        # Test if 2-D dataset is supplied
        w = np.random.randn(2, 3)
        ds = Dataset({"w": (("x", "y"), da.from_array(w, chunks=(1, 2)))})
        ds["x"] = ("x", np.array([0, 1], np.int64))
        ds["y"] = ("y", list("abc"))

        # dask dataframes do not (yet) support multiindex,
        # but when they do, this would be the expected index:
        exp_index = pd.MultiIndex.from_arrays(
            [[0, 0, 0, 1, 1, 1], ["a", "b", "c", "a", "b", "c"]], names=["x", "y"]
        )
        expected = pd.DataFrame({"w": w.reshape(-1)}, index=exp_index)
        # so for now, reset the index
        expected = expected.reset_index(drop=False)
        actual = ds.to_dask_dataframe(set_index=False)

        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(actual.compute(), expected)

    @pytest.mark.xfail(raises=NotImplementedError)
    def test_to_dask_dataframe_2D_set_index(self):
        # This will fail until dask implements MultiIndex support
        w = da.from_array(np.random.randn(2, 3), chunks=(1, 2))
        ds = Dataset({"w": (("x", "y"), w)})
        ds["x"] = ("x", np.array([0, 1], np.int64))
        ds["y"] = ("y", list("abc"))

        expected = ds.compute().to_dataframe()
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_coordinates(self):
        # Test if coordinate is also a dask array
        x = np.random.randn(10)
        t = np.arange(10) * 2

        ds = Dataset(
            {
                "a": ("t", da.from_array(x, chunks=4)),
                "t": ("t", da.from_array(t, chunks=4)),
            }
        )

        expected_pd = pd.DataFrame({"a": x}, index=pd.Index(t, name="t"))
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())

    @pytest.mark.xfail(
        reason="Currently pandas with pyarrow installed will return a `string[pyarrow]` type, "
        "which causes the index to have a different type depending on whether pyarrow is installed"
    )
    def test_to_dask_dataframe_not_daskarray(self):
        # Test if DataArray is not a dask array
        x = np.random.randn(10)
        y = np.arange(10, dtype="uint8")
        t = list("abcdefghij")

        ds = Dataset({"a": ("t", x), "b": ("t", y), "t": ("t", t)})

        expected = pd.DataFrame({"a": x, "b": y}, index=pd.Index(t, name="t"))

        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_no_coordinate(self):
        x = da.from_array(np.random.randn(10), chunks=4)
        ds = Dataset({"x": ("dim_0", x)})

        expected = ds.compute().to_dataframe().reset_index()
        actual = ds.to_dask_dataframe()
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        expected = ds.compute().to_dataframe()
        actual = ds.to_dask_dataframe(set_index=True)
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

    def test_to_dask_dataframe_dim_order(self):
        values = np.array([[1, 2], [3, 4]], dtype=np.int64)
        ds = Dataset({"w": (("x", "y"), values)}).chunk(1)

        expected = ds["w"].to_series().reset_index()
        actual = ds.to_dask_dataframe(dim_order=["x", "y"])
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        expected = ds["w"].T.to_series().reset_index()
        actual = ds.to_dask_dataframe(dim_order=["y", "x"])
        assert isinstance(actual, dd.DataFrame)
        assert_frame_equal(expected, actual.compute())

        with pytest.raises(ValueError, match=r"does not match the set of dimensions"):
            ds.to_dask_dataframe(dim_order=["x"])


@pytest.mark.parametrize("method", ["load", "compute"])
def test_dask_kwargs_variable(method):
    chunked_array = da.from_array(np.arange(3), chunks=(2,))
    x = Variable("y", chunked_array)
    # args should be passed on to dask.compute() (via DaskManager.compute())
    with mock.patch.object(da, "compute", return_value=(np.arange(3),)) as mock_compute:
        getattr(x, method)(foo="bar")
    mock_compute.assert_called_with(chunked_array, foo="bar")


@pytest.mark.parametrize("method", ["load", "compute", "persist"])
def test_dask_kwargs_dataarray(method):
    data = da.from_array(np.arange(3), chunks=(2,))
    x = DataArray(data)
    if method in ["load", "compute"]:
        dask_func = "dask.array.compute"
    else:
        dask_func = "dask.persist"
    # args should be passed on to "dask_func"
    with mock.patch(dask_func) as mock_func:
        getattr(x, method)(foo="bar")
    mock_func.assert_called_with(data, foo="bar")


@pytest.mark.parametrize("method", ["load", "compute", "persist"])
def test_dask_kwargs_dataset(method):
    data = da.from_array(np.arange(3), chunks=(2,))
    x = Dataset({"x": (("y"), data)})
    if method in ["load", "compute"]:
        dask_func = "dask.array.compute"
    else:
        dask_func = "dask.persist"
    # args should be passed on to "dask_func"
    with mock.patch(dask_func) as mock_func:
        getattr(x, method)(foo="bar")
    mock_func.assert_called_with(data, foo="bar")


kernel_call_count = 0


def kernel(name):
    """Dask kernel to test pickling/unpickling and __repr__.
    Must be global to make it pickleable.
    """
    global kernel_call_count
    kernel_call_count += 1
    return np.ones(1, dtype=np.int64)


def build_dask_array(name):
    global kernel_call_count
    kernel_call_count = 0
    return dask.array.Array(
        dask={(name, 0): (kernel, name)}, name=name, chunks=((1,),), dtype=np.int64
    )
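

# Illustrative sketch (not part of the original suite): the counter above lets
# tests assert how many times dask actually materialized data. Building an
# array resets the counter to 0, and each compute of the single-chunk graph
# invokes ``kernel`` exactly once:
#
#     arr = build_dask_array("example")
#     assert kernel_call_count == 0
#     arr.compute()
#     assert kernel_call_count == 1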


@pytest.mark.parametrize(
    "persist", [lambda x: x.persist(), lambda x: dask.persist(x)[0]]
)
def test_persist_Dataset(persist):
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()
    ds = ds + 1
    n = len(ds.foo.data.dask)

    ds2 = persist(ds)

    assert len(ds2.foo.data.dask) == 1
    assert len(ds.foo.data.dask) == n  # doesn't mutate in place


@pytest.mark.parametrize(
    "persist", [lambda x: x.persist(), lambda x: dask.persist(x)[0]]
)
def test_persist_DataArray(persist):
    x = da.arange(10, chunks=(5,))
    y = DataArray(x)
    z = y + 1
    n = len(z.data.dask)

    zz = persist(z)

    assert len(z.data.dask) == n
    assert len(zz.data.dask) == zz.data.npartitions


def test_dataarray_with_dask_coords():
    import toolz

    x = xr.Variable("x", da.arange(8, chunks=(4,)))
    y = xr.Variable("y", da.arange(8, chunks=(4,)) * 2)
    data = da.random.random((8, 8), chunks=(4, 4)) + 1
    array = xr.DataArray(data, dims=["x", "y"])
    array.coords["xx"] = x
    array.coords["yy"] = y

    assert dict(array.__dask_graph__()) == toolz.merge(
        data.__dask_graph__(), x.__dask_graph__(), y.__dask_graph__()
    )

    (array2,) = dask.compute(array)
    assert not dask.is_dask_collection(array2)

    assert all(isinstance(v._variable.data, np.ndarray) for v in array2.coords.values())


def test_basic_compute():
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk({"x": 2})
    for get in [dask.threaded.get, dask.multiprocessing.get, dask.local.get_sync, None]:
        with dask.config.set(scheduler=get):
            ds.compute()
            ds.foo.compute()
            ds.foo.variable.compute()


def test_dask_layers_and_dependencies():
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

    x = dask.delayed(ds)
    assert set(x.__dask_graph__().dependencies).issuperset(
        ds.__dask_graph__().dependencies
    )
    assert set(x.foo.__dask_graph__().dependencies).issuperset(
        ds.__dask_graph__().dependencies
    )


def make_da():
    da = xr.DataArray(
        np.ones((10, 20)),
        dims=["x", "y"],
        coords={"x": np.arange(10), "y": np.arange(100, 120)},
        name="a",
    ).chunk({"x": 4, "y": 5})
    da.x.attrs["long_name"] = "x"
    da.attrs["test"] = "test"
    da.coords["c2"] = 0.5
    da.coords["ndcoord"] = da.x * 2
    da.coords["cxy"] = (da.x * da.y).chunk({"x": 4, "y": 5})

    return da


def make_ds():
    map_ds = xr.Dataset()
    map_ds["a"] = make_da()
    map_ds["b"] = map_ds.a + 50
    map_ds["c"] = map_ds.x + 20
    map_ds = map_ds.chunk({"x": 4, "y": 5})
    map_ds["d"] = ("z", [1, 1, 1, 1])
    map_ds["z"] = [0, 1, 2, 3]
    map_ds["e"] = map_ds.x + map_ds.y
    map_ds.coords["c1"] = 0.5
    map_ds.coords["cx"] = ("x", np.arange(len(map_ds.x)))
    map_ds.coords["cx"].attrs["test2"] = "test2"
    map_ds.attrs["test"] = "test"
    map_ds.coords["xx"] = map_ds["a"] * map_ds.y

    map_ds.x.attrs["long_name"] = "x"
    map_ds.y.attrs["long_name"] = "y"

    return map_ds


# fixtures cannot be used in parametrize statements
# instead use this workaround
# https://docs.pytest.org/en/latest/deprecations.html#calling-fixtures-directly
@pytest.fixture
def map_da():
    return make_da()


@pytest.fixture
def map_ds():
    return make_ds()


def test_unify_chunks(map_ds):
    ds_copy = map_ds.copy()
    ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10})

    with pytest.raises(ValueError, match=r"inconsistent chunks"):
        _ = ds_copy.chunks

    expected_chunks = {"x": (4, 4, 2), "y": (5, 5, 5, 5)}
    with raise_if_dask_computes():
        actual_chunks = ds_copy.unify_chunks().chunks
    assert actual_chunks == expected_chunks
    assert_identical(map_ds, ds_copy.unify_chunks())

    out_a, out_b = xr.unify_chunks(ds_copy.cxy, ds_copy.drop_vars("cxy"))
    assert out_a.chunks == ((4, 4, 2), (5, 5, 5, 5))
    assert out_b.chunks == expected_chunks

    # Test unordered dims
    da = ds_copy["cxy"]
    out_a, out_b = xr.unify_chunks(da.chunk({"x": -1}), da.T.chunk({"y": -1}))
    assert out_a.chunks == ((4, 4, 2), (5, 5, 5, 5))
    assert out_b.chunks == ((5, 5, 5, 5), (4, 4, 2))

    # Test mismatch
    with pytest.raises(ValueError, match=r"Dimension 'x' size mismatch: 10 != 2"):
        xr.unify_chunks(da, da.isel(x=slice(2)))


@pytest.mark.parametrize("obj", [make_ds(), make_da()])
@pytest.mark.parametrize(
    "transform", [lambda x: x.compute(), lambda x: x.unify_chunks()]
)
def test_unify_chunks_shallow_copy(obj, transform):
    obj = transform(obj)
    unified = obj.unify_chunks()
    assert_identical(obj, unified)
    assert obj is not unified


@pytest.mark.parametrize("obj", [make_da()])
def test_auto_chunk_da(obj):
    actual = obj.chunk("auto").data
    expected = obj.data.rechunk("auto")
    np.testing.assert_array_equal(actual, expected)
    assert actual.chunks == expected.chunks


def test_map_blocks_error(map_da, map_ds):
    def bad_func(darray):
        return (darray * darray.x + 5 * darray.y)[:1, :1]

    with pytest.raises(ValueError, match=r"Received dimension 'x' of length 1"):
        xr.map_blocks(bad_func, map_da).compute()

    def returns_numpy(darray):
        return (darray * darray.x + 5 * darray.y).values

    with pytest.raises(TypeError, match=r"Function must return an xarray DataArray"):
        xr.map_blocks(returns_numpy, map_da)

    with pytest.raises(TypeError, match=r"args must be"):
        xr.map_blocks(operator.add, map_da, args=10)

    with pytest.raises(TypeError, match=r"kwargs must be"):
        xr.map_blocks(operator.add, map_da, args=[10], kwargs=[20])

    def really_bad_func(darray):
        raise ValueError("couldn't do anything.")

    with pytest.raises(Exception, match=r"Cannot infer"):
        xr.map_blocks(really_bad_func, map_da)

    ds_copy = map_ds.copy()
    ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10})

    with pytest.raises(ValueError, match=r"inconsistent chunks"):
        xr.map_blocks(bad_func, ds_copy)

    with pytest.raises(TypeError, match=r"Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, kwargs=dict(a=map_da.chunk()))


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        actual = xr.map_blocks(func, obj)
    expected = func(obj)
    assert_chunks_equal(expected.chunk(), actual)
    assert_identical(actual, expected)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_mixed_type_inputs(obj):
    def func(obj1, non_xarray_input, obj2):
        result = obj1 + obj1.x + 5 * obj1.y
        return result

    with raise_if_dask_computes():
        actual = xr.map_blocks(func, obj, args=["non_xarray_input", obj])
    expected = func(obj, "non_xarray_input", obj)
    assert_chunks_equal(expected.chunk(), actual)
    assert_identical(actual, expected)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_convert_args_to_list(obj):
    expected = obj + 10
    with raise_if_dask_computes():
        actual = xr.map_blocks(operator.add, obj, [10])
    assert_chunks_equal(expected.chunk(), actual)
    assert_identical(actual, expected)


def test_map_blocks_dask_args():
    da1 = xr.DataArray(
        np.ones((10, 20)),
        dims=["x", "y"],
        coords={"x": np.arange(10), "y": np.arange(20)},
    ).chunk({"x": 5, "y": 4})

    # check that block shapes are the same
    def sumda(da1, da2):
        assert da1.shape == da2.shape
        return da1 + da2

    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(sumda, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    # one dimension in common
    da2 = (da1 + 1).isel(x=1, drop=True)
    with raise_if_dask_computes():
        mapped = xr.map_blocks(operator.add, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    # test that everything works when dimension names are different
    da2 = (da1 + 1).isel(x=1, drop=True).rename({"y": "k"})
    with raise_if_dask_computes():
        mapped = xr.map_blocks(operator.add, da1, args=[da2])
    xr.testing.assert_equal(da1 + da2, mapped)

    with pytest.raises(ValueError, match=r"Chunk sizes along dimension 'x'"):
        xr.map_blocks(operator.add, da1, args=[da1.chunk({"x": 1})])

    with pytest.raises(ValueError, match=r"cannot align.*index.*are not equal"):
        xr.map_blocks(operator.add, da1, args=[da1.reindex(x=np.arange(20))])

    # reduction
    da1 = da1.chunk({"x": -1})
    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(lambda a, b: (a + b).sum("x"), da1, args=[da2])
    xr.testing.assert_equal((da1 + da2).sum("x"), mapped)

    # reduction with template
    da1 = da1.chunk({"x": -1})
    da2 = da1 + 1
    with raise_if_dask_computes():
        mapped = xr.map_blocks(
            lambda a, b: (a + b).sum("x"), da1, args=[da2], template=da1.sum("x")
        )
    xr.testing.assert_equal((da1 + da2).sum("x"), mapped)

    # bad template: not chunked
    with pytest.raises(ValueError, match="Provided template has no dask arrays"):
        xr.map_blocks(
            lambda a, b: (a + b).sum("x"),
            da1,
            args=[da2],
            template=da1.sum("x").compute(),
        )


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_add_attrs(obj):
    def add_attrs(obj):
        obj = obj.copy(deep=True)
        obj.attrs["new"] = "new"
        obj.cxy.attrs["new2"] = "new2"
        return obj

    expected = add_attrs(obj)
    with raise_if_dask_computes():
        actual = xr.map_blocks(add_attrs, obj)

    assert_identical(actual, expected)

    # when template is specified, attrs are copied from template, not set by function
    with raise_if_dask_computes():
        actual = xr.map_blocks(add_attrs, obj, template=obj)
    assert_identical(actual, obj)


def test_map_blocks_change_name(map_da):
    def change_name(obj):
        obj = obj.copy(deep=True)
        obj.name = "new"
        return obj

    expected = change_name(map_da)
    with raise_if_dask_computes():
        actual = xr.map_blocks(change_name, map_da)

    assert_identical(actual, expected)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_kwargs(obj):
    expected = xr.full_like(obj, fill_value=np.nan)
    with raise_if_dask_computes():
        actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan))
    assert_chunks_equal(expected.chunk(), actual)
    assert_identical(actual, expected)


def test_map_blocks_to_dataarray(map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(lambda x: x.to_dataarray(), map_ds)

    # to_dataarray does not preserve name, so cannot use assert_identical
    assert_equal(actual, map_ds.to_dataarray())


@pytest.mark.parametrize(
    "func",
    [
        lambda x: x,
        lambda x: x.to_dataset(),
        lambda x: x.drop_vars("x"),
        lambda x: x.expand_dims(k=[1, 2, 3]),
        lambda x: x.expand_dims(k=3),
        lambda x: x.assign_coords(new_coord=("y", x.y.data * 2)),
        lambda x: x.astype(np.int32),
        lambda x: x.x,
    ],
)
def test_map_blocks_da_transformations(func, map_da):
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, map_da)

    assert_identical(actual, func(map_da))


@pytest.mark.parametrize(
    "func",
    [
        lambda x: x,
        lambda x: x.drop_vars("cxy"),
        lambda x: x.drop_vars("a"),
        lambda x: x.drop_vars("x"),
        lambda x: x.expand_dims(k=[1, 2, 3]),
        lambda x: x.expand_dims(k=3),
        lambda x: x.rename({"a": "new1", "b": "new2"}),
        lambda x: x.x,
    ],
)
def test_map_blocks_ds_transformations(func, map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, map_ds)

    assert_identical(actual, func(map_ds))


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_da_ds_with_template(obj):
    func = lambda x: x.isel(x=[1])
    # a simple .isel(x=[1, 5, 9]) puts all those in a single chunk.
    template = xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x")
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, obj, template=template)
    assert_identical(actual, template)

    # Check that indexes are written into the graph directly
    dsk = dict(actual.__dask_graph__())
    assert len({k for k in dsk if "x-coordinate" in k})
    assert all(
        isinstance(v, PandasIndex) for k, v in dsk.items() if "x-coordinate" in k
    )

    with raise_if_dask_computes():
        actual = obj.map_blocks(func, template=template)
    assert_identical(actual, template)


def test_map_blocks_roundtrip_string_index():
    ds = xr.Dataset(
        {"data": (["label"], [1, 2, 3])}, coords={"label": ["foo", "bar", "baz"]}
    ).chunk(label=1)
    assert ds.label.dtype == np.dtype("=U3")

    mapped = ds.map_blocks(lambda x: x, template=ds)
    assert mapped.label.dtype == ds.label.dtype

    mapped = ds.map_blocks(lambda x: x, template=None)
    assert mapped.label.dtype == ds.label.dtype

    mapped = ds.data.map_blocks(lambda x: x, template=ds.data)
    assert mapped.label.dtype == ds.label.dtype

    mapped = ds.data.map_blocks(lambda x: x, template=None)
    assert mapped.label.dtype == ds.label.dtype


def test_map_blocks_template_convert_object():
    da = make_da()
    ds = da.to_dataset()

    func = lambda x: x.to_dataset().isel(x=[1])
    template = xr.concat([da.to_dataset().isel(x=[i]) for i in [1, 5, 9]], dim="x")
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, da, template=template)
    assert_identical(actual, template)

    func = lambda x: x.to_dataarray().isel(x=[1])
    template = xr.concat([ds.to_dataarray().isel(x=[i]) for i in [1, 5, 9]], dim="x")
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, ds, template=template)
    assert_identical(actual, template)


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_errors_bad_template(obj):
    with pytest.raises(ValueError, match=r"unexpected coordinate variables"):
        xr.map_blocks(lambda x: x.assign_coords(a=10), obj, template=obj).compute()
    with pytest.raises(ValueError, match=r"does not contain coordinate variables"):
        xr.map_blocks(lambda x: x.drop_vars("cxy"), obj, template=obj).compute()
    with pytest.raises(ValueError, match=r"Dimensions {'x'} missing"):
        xr.map_blocks(lambda x: x.isel(x=1), obj, template=obj).compute()
    with pytest.raises(ValueError, match=r"Received dimension 'x' of length 1"):
        xr.map_blocks(lambda x: x.isel(x=[1]), obj, template=obj).compute()
    with pytest.raises(TypeError, match=r"must be a DataArray"):
        xr.map_blocks(lambda x: x.isel(x=[1]), obj, template=(obj,)).compute()
    with pytest.raises(ValueError, match=r"map_blocks requires that one block"):
        xr.map_blocks(
            lambda x: x.isel(x=[1]).assign_coords(x=10), obj, template=obj.isel(x=[1])
        ).compute()
    with pytest.raises(ValueError, match=r"Expected index 'x' to be"):
        xr.map_blocks(
            lambda a: a.isel(x=[1]).assign_coords(x=[120]),  # assign bad index values
            obj,
            template=xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x"),
        ).compute()


def test_map_blocks_errors_bad_template_2(map_ds):
    with pytest.raises(ValueError, match=r"unexpected data variables {'xyz'}"):
        xr.map_blocks(lambda x: x.assign(xyz=1), map_ds, template=map_ds).compute()


@pytest.mark.parametrize("obj", [make_da(), make_ds()])
def test_map_blocks_object_method(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        expected = xr.map_blocks(func, obj)
        actual = obj.map_blocks(func)

    assert_identical(expected, actual)


def test_map_blocks_hlg_layers():
    # regression test for #3599
    ds = xr.Dataset(
        {
            "x": (("a",), dask.array.ones(10, chunks=(5,))),
            "z": (("b",), dask.array.ones(10, chunks=(5,))),
        }
    )
    mapped = ds.map_blocks(lambda x: x)

    xr.testing.assert_equal(mapped, ds)


def test_make_meta(map_ds):
    from xarray.core.parallel import make_meta

    meta = make_meta(map_ds)

    for variable in map_ds._coord_names:
        assert variable in meta._coord_names
        assert meta.coords[variable].shape == (0,) * meta.coords[variable].ndim

    for variable in map_ds.data_vars:
        assert variable in meta.data_vars
        assert meta.data_vars[variable].shape == (0,) * meta.data_vars[variable].ndim


def test_identical_coords_no_computes():
    lons2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
    a = xr.DataArray(
        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
    )
    b = xr.DataArray(
        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
    )
    with raise_if_dask_computes():
        c = a + b
    assert_identical(c, a)


@pytest.mark.parametrize(
    "obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
)
@pytest.mark.parametrize(
    "transform",
    [
        lambda x: x.reset_coords(),
        lambda x: x.reset_coords(drop=True),
        lambda x: x.isel(x=1),
        lambda x: x.attrs.update(new_attrs=1),
        lambda x: x.assign_coords(cxy=1),
        lambda x: x.rename({"x": "xnew"}),
        lambda x: x.rename({"cxy": "cxynew"}),
    ],
)
def test_token_changes_on_transform(obj, transform):
    with raise_if_dask_computes():
        assert dask.base.tokenize(obj) != dask.base.tokenize(transform(obj))


@pytest.mark.parametrize(
    "obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
)
def test_token_changes_when_data_changes(obj):
    with raise_if_dask_computes():
        t1 = dask.base.tokenize(obj)

    # Change data_var
    if isinstance(obj, DataArray):
        obj *= 2
    else:
        obj["a"] *= 2
    with raise_if_dask_computes():
        t2 = dask.base.tokenize(obj)
    assert t2 != t1

    # Change non-index coord
    obj.coords["ndcoord"] *= 2
    with raise_if_dask_computes():
        t3 = dask.base.tokenize(obj)
    assert t3 != t2

    # Change IndexVariable
    obj = obj.assign_coords(x=obj.x * 2)
    with raise_if_dask_computes():
        t4 = dask.base.tokenize(obj)
    assert t4 != t3


@pytest.mark.parametrize("obj", [make_da().compute(), make_ds().compute()])
def test_token_changes_when_buffer_changes(obj):
    with raise_if_dask_computes():
        t1 = dask.base.tokenize(obj)

    if isinstance(obj, DataArray):
        obj[0, 0] = 123
    else:
        obj["a"][0, 0] = 123
    with raise_if_dask_computes():
        t2 = dask.base.tokenize(obj)
    assert t2 != t1

    obj.coords["ndcoord"][0] = 123
    with raise_if_dask_computes():
        t3 = dask.base.tokenize(obj)
    assert t3 != t2


@pytest.mark.parametrize(
    "transform",
    [lambda x: x, lambda x: x.copy(deep=False), lambda x: x.copy(deep=True)],
)
@pytest.mark.parametrize("obj", [make_da(), make_ds(), make_ds().variables["a"]])
def test_token_identical(obj, transform):
    with raise_if_dask_computes():
        assert dask.base.tokenize(obj) == dask.base.tokenize(transform(obj))
    assert dask.base.tokenize(obj.compute()) == dask.base.tokenize(
        transform(obj.compute())
    )


@pytest.mark.parametrize(
    "obj",
    [
        make_ds(),  # Dataset
        make_ds().variables["c2"],  # Variable
        make_ds().variables["x"],  # IndexVariable
    ],
)
def test_tokenize_empty_attrs(obj):
    """Issues #6970 and #8788"""
    obj.attrs = {}
    assert obj._attrs is None
    a = dask.base.tokenize(obj)

    assert obj.attrs == {}
    assert obj._attrs == {}  # attrs getter changed None to dict
    b = dask.base.tokenize(obj)
    assert a == b

    obj2 = obj.copy()
    c = dask.base.tokenize(obj2)
    assert a == c


def test_recursive_token():
    """Test that tokenization is invoked recursively, and doesn't just rely on the
    output of str()
    """
    a = np.ones(10000)
    b = np.ones(10000)
    b[5000] = 2
    assert str(a) == str(b)
    assert dask.base.tokenize(a) != dask.base.tokenize(b)

    # Test DataArray and Variable
    da_a = DataArray(a)
    da_b = DataArray(b)
    assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)

    # Test Dataset
    ds_a = da_a.to_dataset(name="x")
    ds_b = da_b.to_dataset(name="x")
    assert dask.base.tokenize(ds_a) != dask.base.tokenize(ds_b)

    # Test IndexVariable
    da_a = DataArray(a, dims=["x"], coords={"x": a})
    da_b = DataArray(a, dims=["x"], coords={"x": b})
    assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b)


@requires_scipy_or_netCDF4
def test_normalize_token_with_backend(map_ds):
    with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
        map_ds.to_netcdf(tmp_file)
        read = xr.open_dataset(tmp_file)
        assert dask.base.tokenize(map_ds) != dask.base.tokenize(read)
        read.close()


@pytest.mark.parametrize(
    "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
)
def test_lazy_array_equiv_variables(compat):
    var1 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
    var2 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
    var3 = xr.Variable(("y", "x"), da.zeros((20, 10), chunks=2))

    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2, equiv=lazy_array_equiv)
    # values are actually equal, but we don't know that till we compute, return None
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2 / 2, equiv=lazy_array_equiv) is None

    # shapes are not equal, return False without computes
    with raise_if_dask_computes():
        assert getattr(var1, compat)(var3, equiv=lazy_array_equiv) is False

    # if one or both arrays are numpy, return None
    assert getattr(var1, compat)(var2.compute(), equiv=lazy_array_equiv) is None
    assert (
        getattr(var1.compute(), compat)(var2.compute(), equiv=lazy_array_equiv) is None
    )

    with raise_if_dask_computes():
        assert getattr(var1, compat)(var2.transpose("y", "x"))


@pytest.mark.parametrize(
    "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
)
def test_lazy_array_equiv_merge(compat):
    da1 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
    da2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
    da3 = xr.DataArray(da.ones((20, 10), chunks=2), dims=("y", "x"))

    with raise_if_dask_computes():
        xr.merge([da1, da2], compat=compat)
    # shapes are not equal; no computes necessary
    with raise_if_dask_computes(max_computes=0):
        with pytest.raises(ValueError):
            xr.merge([da1, da3], compat=compat)
    with raise_if_dask_computes(max_computes=2):
        xr.merge([da1, da2 / 2], compat=compat)


@pytest.mark.filterwarnings("ignore::FutureWarning")  # transpose_coords
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@pytest.mark.parametrize(
    "transform",
    [
        lambda a: a.assign_attrs(new_attr="anew"),
        lambda a: a.assign_coords(cxy=a.cxy),
        lambda a: a.copy(),
        lambda a: a.isel(x=slice(None)),
        lambda a: a.loc[dict(x=slice(None))],
        lambda a: a.transpose(...),
        lambda a: a.squeeze(),  # no dimensions to squeeze
        lambda a: a.reindex(x=a.x),
        lambda a: a.reindex_like(a),
        lambda a: a.rename({"cxy": "cnew"}).rename({"cnew": "cxy"}),
        lambda a: a.pipe(lambda x: x),
        lambda a: xr.align(a, xr.zeros_like(a))[0],
        # assign
        # swap_dims
        # set_index / reset_index
    ],
)
def test_transforms_pass_lazy_array_equiv(obj, transform):
    with raise_if_dask_computes():
        assert_equal(obj, transform(obj))


def test_more_transforms_pass_lazy_array_equiv(map_da, map_ds):
    with raise_if_dask_computes():
        assert_equal(map_ds.cxy.broadcast_like(map_ds.cxy), map_ds.cxy)
        assert_equal(xr.broadcast(map_ds.cxy, map_ds.cxy)[0], map_ds.cxy)
        assert_equal(map_ds.map(lambda x: x), map_ds)
        assert_equal(map_ds.set_coords("a").reset_coords("a"), map_ds)
        assert_equal(map_ds.assign({"a": map_ds.a}), map_ds)

        # fails because of index error
        # assert_equal(
        #     map_ds.rename_dims({"x": "xnew"}).rename_dims({"xnew": "x"}), map_ds
        # )

        assert_equal(
            map_ds.rename_vars({"cxy": "cnew"}).rename_vars({"cnew": "cxy"}), map_ds
        )

        assert_equal(map_da._from_temp_dataset(map_da._to_temp_dataset()), map_da)
        assert_equal(map_da.astype(map_da.dtype), map_da)
        assert_equal(map_da.transpose("y", "x", transpose_coords=False).cxy, map_da.cxy)


def test_optimize():
    # https://github.com/pydata/xarray/issues/3698
    a = dask.array.ones((10, 4), chunks=(5, 2))
    arr = xr.DataArray(a).chunk(5)
    (arr2,) = dask.optimize(arr)
    arr2.compute()


def test_graph_manipulation():
    """dask.graph_manipulation passes an optional parameter, "rename", to the rebuilder
    function returned by __dask_postpersist__; also, the dsk passed to the rebuilder is
    a HighLevelGraph whereas with dask.persist() and dask.optimize() it's a plain dict.
    """
    import dask.graph_manipulation as gm

    v = Variable(["x"], [1, 2]).chunk(-1).chunk(1) * 2
    da = DataArray(v)
    ds = Dataset({"d1": v[0], "d2": v[1], "d3": ("x", [3, 4])})

    v2, da2, ds2 = gm.clone(v, da, ds)

    assert_equal(v2, v)
    assert_equal(da2, da)
    assert_equal(ds2, ds)

    for a, b in ((v, v2), (da, da2), (ds, ds2)):
        assert a.__dask_layers__() != b.__dask_layers__()
        assert len(a.__dask_layers__()) == len(b.__dask_layers__())
        assert a.__dask_graph__().keys() != b.__dask_graph__().keys()
        assert len(a.__dask_graph__()) == len(b.__dask_graph__())
        assert a.__dask_graph__().layers.keys() != b.__dask_graph__().layers.keys()
        assert len(a.__dask_graph__().layers) == len(b.__dask_graph__().layers)

    # Above we performed a slice operation; adding the two slices back together creates
    # a diamond-shaped dependency graph, which in turn would trigger a collision in
    # layer names if we were to use HighLevelGraph.cull() instead of
    # HighLevelGraph.cull_layers() in Dataset.__dask_postpersist__().
    assert_equal(ds2.d1 + ds2.d2, ds.d1 + ds.d2)


def test_new_index_var_computes_once():
    # regression test for GH1533
    data = dask.array.from_array(np.array([100, 200]))
    with raise_if_dask_computes(max_computes=1):
        Dataset(coords={"z": ("z", data)})


def test_minimize_graph_size():
    # regression test for https://github.com/pydata/xarray/issues/8409
    ds = Dataset(
        {
            "foo": (
                ("x", "y", "z"),
                dask.array.ones((120, 120, 120), chunks=(20, 20, 1)),
            )
        },
        coords={"x": np.arange(120), "y": np.arange(120), "z": np.arange(120)},
    )

    mapped = ds.map_blocks(lambda x: x)
    graph = dict(mapped.__dask_graph__())

    numchunks = {k: len(v) for k, v in ds.chunksizes.items()}
    for var in "xyz":
        actual = len([key for key in graph if var in key[0]])
        # assert that each chunk of an index variable is included only once,
        # not the product of the number of chunks of all the other dimensions
        # (e.g. previously for 'x', actual == numchunks['y'] * numchunks['z'])
        assert actual == numchunks[var], (actual, numchunks[var])


def test_idxmin_chunking():
    # GH9425
    x, y, t = 100, 100, 10
    rang = np.arange(t * x * y)
    da = xr.DataArray(
        rang.reshape(t, x, y), coords={"time": range(t), "x": range(x), "y": range(y)}
    )
    da = da.chunk(dict(time=-1, x=25, y=25))
    actual = da.idxmin("time")
    assert actual.chunksizes == {k: da.chunksizes[k] for k in ["x", "y"]}
    assert_identical(actual, da.compute().idxmin("time"))