CCR/.venv/lib/python3.12/site-packages/xarray/util/generate_aggregations.py

"""Generate module and stub file for arithmetic operators of various xarray classes.

For internal xarray development use only.

Usage:
    python xarray/util/generate_aggregations.py
    pytest --doctest-modules xarray/{core,namedarray}/_aggregations.py --accept || true
    pytest --doctest-modules xarray/{core,namedarray}/_aggregations.py

This requires [pytest-accept](https://github.com/max-sixty/pytest-accept).
The second run of pytest is deliberate, since the first will return an error
while replacing the doctests.

"""

import collections
import textwrap
from dataclasses import dataclass, field

MODULE_PREAMBLE = '''\
"""Mixin classes with reduction operations."""

# This file was generated using xarray.util.generate_aggregations. Do not edit manually.

from __future__ import annotations

from collections.abc import Callable, Sequence
from typing import TYPE_CHECKING, Any

from xarray.core import duck_array_ops
from xarray.core.options import OPTIONS
from xarray.core.types import Dims, Self
from xarray.core.utils import contains_only_chunked_or_numpy, module_available

if TYPE_CHECKING:
    from xarray.core.dataarray import DataArray
    from xarray.core.dataset import Dataset

flox_available = module_available("flox")
'''

NAMED_ARRAY_MODULE_PREAMBLE = '''\
"""Mixin classes with reduction operations."""
# This file was generated using xarray.util.generate_aggregations. Do not edit manually.

from __future__ import annotations

from collections.abc import Callable, Sequence
from typing import Any

from xarray.core import duck_array_ops
from xarray.core.types import Dims, Self
'''

AGGREGATIONS_PREAMBLE = """

class {obj}{cls}Aggregations:
    __slots__ = ()

    def reduce(
        self,
        func: Callable[..., Any],
        dim: Dims = None,
        *,
        axis: int | Sequence[int] | None = None,
        keep_attrs: bool | None = None,
        keepdims: bool = False,
        **kwargs: Any,
    ) -> Self:
        raise NotImplementedError()"""

NAMED_ARRAY_AGGREGATIONS_PREAMBLE = """

class {obj}{cls}Aggregations:
    __slots__ = ()

    def reduce(
        self,
        func: Callable[..., Any],
        dim: Dims = None,
        *,
        axis: int | Sequence[int] | None = None,
        keepdims: bool = False,
        **kwargs: Any,
    ) -> Self:
        raise NotImplementedError()"""


GROUPBY_PREAMBLE = """

class {obj}{cls}Aggregations:
    _obj: {obj}

    def reduce(
        self,
        func: Callable[..., Any],
        dim: Dims = None,
        *,
        axis: int | Sequence[int] | None = None,
        keep_attrs: bool | None = None,
        keepdims: bool = False,
        **kwargs: Any,
    ) -> {obj}:
        raise NotImplementedError()

    def _flox_reduce(
        self,
        dim: Dims,
        **kwargs: Any,
    ) -> {obj}:
        raise NotImplementedError()"""

RESAMPLE_PREAMBLE = """

class {obj}{cls}Aggregations:
    _obj: {obj}

    def reduce(
        self,
        func: Callable[..., Any],
        dim: Dims = None,
        *,
        axis: int | Sequence[int] | None = None,
        keep_attrs: bool | None = None,
        keepdims: bool = False,
        **kwargs: Any,
    ) -> {obj}:
        raise NotImplementedError()

    def _flox_reduce(
        self,
        dim: Dims,
        **kwargs: Any,
    ) -> {obj}:
        raise NotImplementedError()"""

TEMPLATE_REDUCTION_SIGNATURE = '''
    def {method}(
        self,
        dim: Dims = None,{kw_only}{extra_kwargs}{keep_attrs}
        **kwargs: Any,
    ) -> Self:
        """
        Reduce this {obj}'s data by applying ``{method}`` along some dimension(s).

        Parameters
        ----------'''

TEMPLATE_REDUCTION_SIGNATURE_GROUPBY = '''
    def {method}(
        self,
        dim: Dims = None,
        *,{extra_kwargs}
        keep_attrs: bool | None = None,
        **kwargs: Any,
    ) -> {obj}:
        """
        Reduce this {obj}'s data by applying ``{method}`` along some dimension(s).

        Parameters
        ----------'''

TEMPLATE_RETURNS = """
        Returns
        -------
        reduced : {obj}
            New {obj} with ``{method}`` applied to its data and the
            indicated dimension(s) removed"""

TEMPLATE_SEE_ALSO = """
        See Also
        --------
{see_also_methods}
        :ref:`{docref}`
            User guide on {docref_description}."""

TEMPLATE_NOTES = """
        Notes
        -----
{notes}"""

_DIM_DOCSTRING = """dim : str, Iterable of Hashable, "..." or None, default: None
    Name of dimension[s] along which to apply ``{method}``. For e.g. ``dim="x"``
    or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions."""

_DIM_DOCSTRING_GROUPBY = """dim : str, Iterable of Hashable, "..." or None, default: None
    Name of dimension[s] along which to apply ``{method}``. For e.g. ``dim="x"``
    or ``dim=["x", "y"]``. If None, will reduce over the {cls} dimensions.
    If "...", will reduce over all dimensions."""

_SKIPNA_DOCSTRING = """skipna : bool or None, optional
    If True, skip missing values (as marked by NaN). By default, only
    skips missing values for float dtypes; other dtypes either do not
    have a sentinel missing value (int) or ``skipna=True`` has not been
    implemented (object, datetime64 or timedelta64)."""

_MINCOUNT_DOCSTRING = """min_count : int or None, optional
    The required number of valid values to perform the operation. If
    fewer than min_count non-NA values are present the result will be
    NA. Only used if skipna is set to True or defaults to True for the
    array's dtype. Changed in version 0.17.0: if specified on an integer
    array and skipna=True, the result will be a float array."""

_DDOF_DOCSTRING = """ddof : int, default: 0
    “Delta Degrees of Freedom”: the divisor used in the calculation is ``N - ddof``,
    where ``N`` represents the number of elements."""

_KEEP_ATTRS_DOCSTRING = """keep_attrs : bool or None, optional
    If True, ``attrs`` will be copied from the original
    object to the new one.  If False, the new object will be
    returned without attributes."""

_KWARGS_DOCSTRING = """**kwargs : Any
    Additional keyword arguments passed on to the appropriate array
    function for calculating ``{method}`` on this object's data.
    These could include dask-specific kwargs like ``split_every``."""

_NUMERIC_ONLY_NOTES = "Non-numeric variables will be removed prior to reducing."

_FLOX_NOTES_TEMPLATE = """Use the ``flox`` package to significantly speed up {kind} computations,
especially with dask arrays. Xarray will use flox by default if installed.
Pass flox-specific keyword arguments in ``**kwargs``.
See the `flox documentation <https://flox.readthedocs.io>`_ for more."""
_FLOX_GROUPBY_NOTES = _FLOX_NOTES_TEMPLATE.format(kind="groupby")
_FLOX_RESAMPLE_NOTES = _FLOX_NOTES_TEMPLATE.format(kind="resampling")
_CUM_NOTES = """Note that the methods on the ``cumulative`` method are more performant (with numbagg installed)
and better supported. ``cumsum`` and ``cumprod`` may be deprecated
in the future."""

ExtraKwarg = collections.namedtuple("ExtraKwarg", "docs kwarg call example")
skipna = ExtraKwarg(
    docs=_SKIPNA_DOCSTRING,
    kwarg="skipna: bool | None = None,",
    call="skipna=skipna,",
    example="""\n
        Use ``skipna`` to control whether NaNs are ignored.

        >>> {calculation}(skipna=False)""",
)
min_count = ExtraKwarg(
    docs=_MINCOUNT_DOCSTRING,
    kwarg="min_count: int | None = None,",
    call="min_count=min_count,",
    example="""\n
        Specify ``min_count`` for finer control over when NaNs are ignored.

        >>> {calculation}(skipna=True, min_count=2)""",
)
ddof = ExtraKwarg(
    docs=_DDOF_DOCSTRING,
    kwarg="ddof: int = 0,",
    call="ddof=ddof,",
    example="""\n
        Specify ``ddof=1`` for an unbiased estimate.

        >>> {calculation}(skipna=True, ddof=1)""",
)


@dataclass
class DataStructure:
    name: str
    create_example: str
    example_var_name: str
    numeric_only: bool = False
    see_also_modules: tuple[str, ...] = tuple


class Method:
    def __init__(
        self,
        name,
        bool_reduce=False,
        extra_kwargs=tuple(),
        numeric_only=False,
        see_also_modules=("numpy", "dask.array"),
        see_also_methods=(),
        min_flox_version=None,
        additional_notes="",
    ):
        self.name = name
        self.extra_kwargs = extra_kwargs
        self.numeric_only = numeric_only
        self.see_also_modules = see_also_modules
        self.see_also_methods = see_also_methods
        self.min_flox_version = min_flox_version
        self.additional_notes = additional_notes
        if bool_reduce:
            self.array_method = f"array_{name}"
            self.np_example_array = (
                """np.array([True, True, True, True, True, False], dtype=bool)"""
            )

        else:
            self.array_method = name
            self.np_example_array = """np.array([1, 2, 3, 0, 2, np.nan])"""


@dataclass
class AggregationGenerator:
    _dim_docstring = _DIM_DOCSTRING
    _template_signature = TEMPLATE_REDUCTION_SIGNATURE

    cls: str
    datastructure: DataStructure
    methods: tuple[Method, ...]
    docref: str
    docref_description: str
    example_call_preamble: str
    definition_preamble: str
    has_keep_attrs: bool = True
    notes: str = ""
    preamble: str = field(init=False)

    def __post_init__(self):
        self.preamble = self.definition_preamble.format(
            obj=self.datastructure.name, cls=self.cls
        )

    def generate_methods(self):
        yield [self.preamble]
        for method in self.methods:
            yield self.generate_method(method)

    def generate_method(self, method: Method):
        has_kw_only = method.extra_kwargs or self.has_keep_attrs

        template_kwargs = dict(
            obj=self.datastructure.name,
            method=method.name,
            keep_attrs=(
                "\n        keep_attrs: bool | None = None,"
                if self.has_keep_attrs
                else ""
            ),
            kw_only="\n        *," if has_kw_only else "",
        )

        if method.extra_kwargs:
            extra_kwargs = "\n        " + "\n        ".join(
                [kwarg.kwarg for kwarg in method.extra_kwargs if kwarg.kwarg]
            )
        else:
            extra_kwargs = ""

        yield self._template_signature.format(
            **template_kwargs,
            extra_kwargs=extra_kwargs,
        )

        for text in [
            self._dim_docstring.format(method=method.name, cls=self.cls),
            *(kwarg.docs for kwarg in method.extra_kwargs if kwarg.docs),
            _KEEP_ATTRS_DOCSTRING if self.has_keep_attrs else None,
            _KWARGS_DOCSTRING.format(method=method.name),
        ]:
            if text:
                yield textwrap.indent(text, 8 * " ")

        yield TEMPLATE_RETURNS.format(**template_kwargs)

        # we want Dataset.count to refer to DataArray.count
        # but we also want DatasetGroupBy.count to refer to Dataset.count
        # The generic aggregations have self.cls == ''
        others = (
            self.datastructure.see_also_modules
            if self.cls == ""
            else (self.datastructure.name,)
        )
        see_also_methods_from_modules = (
            " " * 8 + f"{mod}.{method.name}"
            for mod in (method.see_also_modules + others)
        )
        see_also_methods_from_methods = (
            " " * 8 + f"{self.datastructure.name}.{method}"
            for method in method.see_also_methods
        )
        see_also_methods = "\n".join(
            [*see_also_methods_from_modules, *see_also_methods_from_methods]
        )
        # Fixes broken links mentioned in #8055
        yield TEMPLATE_SEE_ALSO.format(
            **template_kwargs,
            docref=self.docref,
            docref_description=self.docref_description,
            see_also_methods=see_also_methods,
        )

        notes = self.notes
        if method.numeric_only:
            if notes != "":
                notes += "\n\n"
            notes += _NUMERIC_ONLY_NOTES

        if method.additional_notes:
            if notes != "":
                notes += "\n\n"
            notes += method.additional_notes

        if notes != "":
            yield TEMPLATE_NOTES.format(notes=textwrap.indent(notes, 8 * " "))

        yield textwrap.indent(self.generate_example(method=method), "")
        yield '        """'

        yield self.generate_code(method, self.has_keep_attrs)

    def generate_example(self, method):
        created = self.datastructure.create_example.format(
            example_array=method.np_example_array
        )
        calculation = f"{self.datastructure.example_var_name}{self.example_call_preamble}.{method.name}"
        if method.extra_kwargs:
            extra_examples = "".join(
                kwarg.example for kwarg in method.extra_kwargs if kwarg.example
            ).format(calculation=calculation, method=method.name)
        else:
            extra_examples = ""

        return f"""
        Examples
        --------{created}
        >>> {self.datastructure.example_var_name}

        >>> {calculation}(){extra_examples}"""


class GroupByAggregationGenerator(AggregationGenerator):
    _dim_docstring = _DIM_DOCSTRING_GROUPBY
    _template_signature = TEMPLATE_REDUCTION_SIGNATURE_GROUPBY

    def generate_code(self, method, has_keep_attrs):
        extra_kwargs = [kwarg.call for kwarg in method.extra_kwargs if kwarg.call]

        if self.datastructure.numeric_only:
            extra_kwargs.append(f"numeric_only={method.numeric_only},")

        # median isn't enabled yet, because it would break if a single group was present in multiple
        # chunks. The non-flox code path will just rechunk every group to a single chunk and execute the median
        method_is_not_flox_supported = method.name in ("median", "cumsum", "cumprod")
        if method_is_not_flox_supported:
            indent = 12
        else:
            indent = 16

        if extra_kwargs:
            extra_kwargs = textwrap.indent("\n" + "\n".join(extra_kwargs), indent * " ")
        else:
            extra_kwargs = ""

        if method_is_not_flox_supported:
            return f"""\
        return self.reduce(
            duck_array_ops.{method.array_method},
            dim=dim,{extra_kwargs}
            keep_attrs=keep_attrs,
            **kwargs,
        )"""

        min_version_check = f"""
            and module_available("flox", minversion="{method.min_flox_version}")"""

        return (
            """\
        if (
            flox_available
            and OPTIONS["use_flox"]"""
            + (min_version_check if method.min_flox_version is not None else "")
            + f"""
            and contains_only_chunked_or_numpy(self._obj)
        ):
            return self._flox_reduce(
                func="{method.name}",
                dim=dim,{extra_kwargs}
                # fill_value=fill_value,
                keep_attrs=keep_attrs,
                **kwargs,
            )
        else:
            return self.reduce(
                duck_array_ops.{method.array_method},
                dim=dim,{extra_kwargs}
                keep_attrs=keep_attrs,
                **kwargs,
            )"""
        )


class GenericAggregationGenerator(AggregationGenerator):
    def generate_code(self, method, has_keep_attrs):
        extra_kwargs = [kwarg.call for kwarg in method.extra_kwargs if kwarg.call]

        if self.datastructure.numeric_only:
            extra_kwargs.append(f"numeric_only={method.numeric_only},")

        if extra_kwargs:
            extra_kwargs = textwrap.indent("\n" + "\n".join(extra_kwargs), 12 * " ")
        else:
            extra_kwargs = ""
        keep_attrs = (
            "\n" + 12 * " " + "keep_attrs=keep_attrs," if has_keep_attrs else ""
        )
        return f"""\
        return self.reduce(
            duck_array_ops.{method.array_method},
            dim=dim,{extra_kwargs}{keep_attrs}
            **kwargs,
        )"""


AGGREGATION_METHODS = (
    # Reductions:
    Method("count", see_also_modules=("pandas.DataFrame", "dask.dataframe.DataFrame")),
    Method("all", bool_reduce=True),
    Method("any", bool_reduce=True),
    Method("max", extra_kwargs=(skipna,)),
    Method("min", extra_kwargs=(skipna,)),
    Method("mean", extra_kwargs=(skipna,), numeric_only=True),
    Method("prod", extra_kwargs=(skipna, min_count), numeric_only=True),
    Method("sum", extra_kwargs=(skipna, min_count), numeric_only=True),
    Method("std", extra_kwargs=(skipna, ddof), numeric_only=True),
    Method("var", extra_kwargs=(skipna, ddof), numeric_only=True),
    Method(
        "median", extra_kwargs=(skipna,), numeric_only=True, min_flox_version="0.9.2"
    ),
    # Cumulatives:
    Method(
        "cumsum",
        extra_kwargs=(skipna,),
        numeric_only=True,
        see_also_methods=("cumulative",),
        additional_notes=_CUM_NOTES,
    ),
    Method(
        "cumprod",
        extra_kwargs=(skipna,),
        numeric_only=True,
        see_also_methods=("cumulative",),
        additional_notes=_CUM_NOTES,
    ),
)


DATATREE_OBJECT = DataStructure(
    name="DataTree",
    create_example="""
        >>> dt = xr.DataTree(
        ...     xr.Dataset(
        ...         data_vars=dict(foo=("time", {example_array})),
        ...         coords=dict(
        ...             time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)),
        ...             labels=("time", np.array(["a", "b", "c", "c", "b", "a"])),
        ...         ),
        ...     ),
        ... )""",
    example_var_name="dt",
    numeric_only=True,
    see_also_modules=("Dataset", "DataArray"),
)
DATASET_OBJECT = DataStructure(
    name="Dataset",
    create_example="""
        >>> da = xr.DataArray(
        ...     {example_array},
        ...     dims="time",
        ...     coords=dict(
        ...         time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)),
        ...         labels=("time", np.array(["a", "b", "c", "c", "b", "a"])),
        ...     ),
        ... )
        >>> ds = xr.Dataset(dict(da=da))""",
    example_var_name="ds",
    numeric_only=True,
    see_also_modules=("DataArray",),
)
DATAARRAY_OBJECT = DataStructure(
    name="DataArray",
    create_example="""
        >>> da = xr.DataArray(
        ...     {example_array},
        ...     dims="time",
        ...     coords=dict(
        ...         time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)),
        ...         labels=("time", np.array(["a", "b", "c", "c", "b", "a"])),
        ...     ),
        ... )""",
    example_var_name="da",
    numeric_only=False,
    see_also_modules=("Dataset",),
)
DATATREE_GENERATOR = GenericAggregationGenerator(
    cls="",
    datastructure=DATATREE_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="agg",
    docref_description="reduction or aggregation operations",
    example_call_preamble="",
    definition_preamble=AGGREGATIONS_PREAMBLE,
)
DATASET_GENERATOR = GenericAggregationGenerator(
    cls="",
    datastructure=DATASET_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="agg",
    docref_description="reduction or aggregation operations",
    example_call_preamble="",
    definition_preamble=AGGREGATIONS_PREAMBLE,
)
DATAARRAY_GENERATOR = GenericAggregationGenerator(
    cls="",
    datastructure=DATAARRAY_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="agg",
    docref_description="reduction or aggregation operations",
    example_call_preamble="",
    definition_preamble=AGGREGATIONS_PREAMBLE,
)
DATAARRAY_GROUPBY_GENERATOR = GroupByAggregationGenerator(
    cls="GroupBy",
    datastructure=DATAARRAY_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="groupby",
    docref_description="groupby operations",
    example_call_preamble='.groupby("labels")',
    definition_preamble=GROUPBY_PREAMBLE,
    notes=_FLOX_GROUPBY_NOTES,
)
DATAARRAY_RESAMPLE_GENERATOR = GroupByAggregationGenerator(
    cls="Resample",
    datastructure=DATAARRAY_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="resampling",
    docref_description="resampling operations",
    example_call_preamble='.resample(time="3ME")',
    definition_preamble=RESAMPLE_PREAMBLE,
    notes=_FLOX_RESAMPLE_NOTES,
)
DATASET_GROUPBY_GENERATOR = GroupByAggregationGenerator(
    cls="GroupBy",
    datastructure=DATASET_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="groupby",
    docref_description="groupby operations",
    example_call_preamble='.groupby("labels")',
    definition_preamble=GROUPBY_PREAMBLE,
    notes=_FLOX_GROUPBY_NOTES,
)
DATASET_RESAMPLE_GENERATOR = GroupByAggregationGenerator(
    cls="Resample",
    datastructure=DATASET_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="resampling",
    docref_description="resampling operations",
    example_call_preamble='.resample(time="3ME")',
    definition_preamble=RESAMPLE_PREAMBLE,
    notes=_FLOX_RESAMPLE_NOTES,
)

NAMED_ARRAY_OBJECT = DataStructure(
    name="NamedArray",
    create_example="""
        >>> from xarray.namedarray.core import NamedArray
        >>> na = NamedArray(
        ...     "x", {example_array}
        ... )""",
    example_var_name="na",
    numeric_only=False,
    see_also_modules=("Dataset", "DataArray"),
)

NAMED_ARRAY_GENERATOR = GenericAggregationGenerator(
    cls="",
    datastructure=NAMED_ARRAY_OBJECT,
    methods=AGGREGATION_METHODS,
    docref="agg",
    docref_description="reduction or aggregation operations",
    example_call_preamble="",
    definition_preamble=NAMED_ARRAY_AGGREGATIONS_PREAMBLE,
    has_keep_attrs=False,
)


def write_methods(filepath, generators, preamble):
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(preamble)
        for gen in generators:
            for lines in gen.generate_methods():
                for line in lines:
                    f.write(line + "\n")


if __name__ == "__main__":
    import os
    from pathlib import Path

    p = Path(os.getcwd())
    write_methods(
        filepath=p.parent / "xarray" / "xarray" / "core" / "_aggregations.py",
        generators=[
            DATATREE_GENERATOR,
            DATASET_GENERATOR,
            DATAARRAY_GENERATOR,
            DATASET_GROUPBY_GENERATOR,
            DATASET_RESAMPLE_GENERATOR,
            DATAARRAY_GROUPBY_GENERATOR,
            DATAARRAY_RESAMPLE_GENERATOR,
        ],
        preamble=MODULE_PREAMBLE,
    )
    write_methods(
        filepath=p.parent / "xarray" / "xarray" / "namedarray" / "_aggregations.py",
        generators=[NAMED_ARRAY_GENERATOR],
        preamble=NAMED_ARRAY_MODULE_PREAMBLE,
    )
    # filepath = p.parent / "core" / "_aggregations.py"  # Run from script location