"""DatetimeIndex analog for cftime.datetime objects""" # The pandas.Index subclass defined here was copied and adapted for # use with cftime.datetime objects based on the source code defining # pandas.DatetimeIndex. # For reference, here is a copy of the pandas copyright notice: # (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team # All rights reserved. # Copyright (c) 2008-2011 AQR Capital Management, LLC # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of the copyright holder nor the names of any # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from __future__ import annotations import math from datetime import timedelta from typing import TYPE_CHECKING, Any, Optional import numpy as np import pandas as pd from packaging.version import Version from xarray.coding.times import ( _STANDARD_CALENDARS, _parse_iso8601, cftime_to_nptime, infer_calendar_name, ) from xarray.core.common import _contains_cftime_datetimes from xarray.core.options import OPTIONS from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import attempt_import, emit_user_level_warning, is_scalar if TYPE_CHECKING: from xarray.coding.cftime_offsets import BaseCFTimeOffset from xarray.core.types import Self # constants for cftimeindex.repr CFTIME_REPR_LENGTH = 19 ITEMS_IN_REPR_MAX_ELSE_ELLIPSIS = 100 REPR_ELLIPSIS_SHOW_ITEMS_FRONT_END = 10 OUT_OF_BOUNDS_TIMEDELTA_ERRORS: tuple[type[Exception], ...] try: OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (pd.errors.OutOfBoundsTimedelta, OverflowError) except AttributeError: OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,) def _parsed_string_to_bounds(date_type, resolution, parsed): """Generalization of pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds for use with non-standard calendars and cftime.datetime objects. """ if resolution == "year": return ( date_type(parsed.year, 1, 1), date_type(parsed.year + 1, 1, 1) - timedelta(microseconds=1), ) elif resolution == "month": if parsed.month == 12: end = date_type(parsed.year + 1, 1, 1) - timedelta(microseconds=1) else: end = date_type(parsed.year, parsed.month + 1, 1) - timedelta( microseconds=1 ) return date_type(parsed.year, parsed.month, 1), end elif resolution == "day": start = date_type(parsed.year, parsed.month, parsed.day) return start, start + timedelta(days=1, microseconds=-1) elif resolution == "hour": start = date_type(parsed.year, parsed.month, parsed.day, parsed.hour) return start, start + timedelta(hours=1, microseconds=-1) elif resolution == "minute": start = date_type( parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute ) return start, start + timedelta(minutes=1, microseconds=-1) elif resolution == "second": start = date_type( parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, ) return start, start + timedelta(seconds=1, microseconds=-1) else: raise KeyError def get_date_field(datetimes, field): """Adapted from pandas.tslib.get_date_field""" return np.array([getattr(date, field) for date in datetimes], dtype=np.int64) def _field_accessor(name, docstring=None, min_cftime_version="0.0"): """Adapted from pandas.tseries.index._field_accessor""" def f(self, min_cftime_version=min_cftime_version): if TYPE_CHECKING: import cftime else: cftime = attempt_import("cftime") if Version(cftime.__version__) >= Version(min_cftime_version): return get_date_field(self._data, name) else: raise ImportError( f"The {name:!r} accessor requires a minimum " f"version of cftime of {min_cftime_version}. Found an " f"installed version of {cftime.__version__}." ) f.__name__ = name f.__doc__ = docstring return property(f) def get_date_type(self): if self._data.size: return type(self._data[0]) else: return None def assert_all_valid_date_type(data): if TYPE_CHECKING: import cftime else: cftime = attempt_import("cftime") if len(data) > 0: sample = data[0] date_type = type(sample) if not isinstance(sample, cftime.datetime): raise TypeError( "CFTimeIndex requires cftime.datetime " f"objects. Got object of {date_type}." ) if not all(isinstance(value, date_type) for value in data): raise TypeError( "CFTimeIndex requires using datetime " f"objects of all the same type. Got\n{data}." ) def format_row(times, indent=0, separator=", ", row_end=",\n"): """Format a single row from format_times.""" return indent * " " + separator.join(map(str, times)) + row_end def format_times( index, max_width, offset, separator=", ", first_row_offset=0, intermediate_row_end=",\n", last_row_end="", ): """Format values of cftimeindex as pd.Index.""" n_per_row = max(max_width // (CFTIME_REPR_LENGTH + len(separator)), 1) n_rows = math.ceil(len(index) / n_per_row) representation = "" for row in range(n_rows): indent = first_row_offset if row == 0 else offset row_end = last_row_end if row == n_rows - 1 else intermediate_row_end times_for_row = index[row * n_per_row : (row + 1) * n_per_row] representation += format_row( times_for_row, indent=indent, separator=separator, row_end=row_end ) return representation def format_attrs(index, separator=", "): """Format attributes of CFTimeIndex for __repr__.""" attrs = { "dtype": f"'{index.dtype}'", "length": f"{len(index)}", "calendar": f"{index.calendar!r}", "freq": f"{index.freq!r}", } attrs_str = [f"{k}={v}" for k, v in attrs.items()] attrs_str = f"{separator}".join(attrs_str) return attrs_str class CFTimeIndex(pd.Index): """Custom Index for working with CF calendars and dates All elements of a CFTimeIndex must be cftime.datetime objects. Parameters ---------- data : array or CFTimeIndex Sequence of cftime.datetime objects to use in index name : str, default: None Name of the resulting index See Also -------- cftime_range """ _data: np.ndarray year = _field_accessor("year", "The year of the datetime") month = _field_accessor("month", "The month of the datetime") day = _field_accessor("day", "The days of the datetime") hour = _field_accessor("hour", "The hours of the datetime") minute = _field_accessor("minute", "The minutes of the datetime") second = _field_accessor("second", "The seconds of the datetime") microsecond = _field_accessor("microsecond", "The microseconds of the datetime") dayofyear = _field_accessor( "dayofyr", "The ordinal day of year of the datetime", "1.0.2.1" ) dayofweek = _field_accessor("dayofwk", "The day of week of the datetime", "1.0.2.1") days_in_month = _field_accessor( "daysinmonth", "The number of days in the month of the datetime", "1.1.0.0" ) date_type = property(get_date_type) def __new__(cls, data, name=None, **kwargs): assert_all_valid_date_type(data) if name is None and hasattr(data, "name"): name = data.name result = object.__new__(cls) result._data = np.array(data, dtype="O") result.name = name result._cache = {} return result def __repr__(self): """ Return a string representation for this object. """ klass_name = type(self).__name__ display_width = OPTIONS["display_width"] offset = len(klass_name) + 2 if len(self) <= ITEMS_IN_REPR_MAX_ELSE_ELLIPSIS: datastr = format_times( self.values, display_width, offset=offset, first_row_offset=0 ) else: front_str = format_times( self.values[:REPR_ELLIPSIS_SHOW_ITEMS_FRONT_END], display_width, offset=offset, first_row_offset=0, last_row_end=",", ) end_str = format_times( self.values[-REPR_ELLIPSIS_SHOW_ITEMS_FRONT_END:], display_width, offset=offset, first_row_offset=offset, ) datastr = "\n".join([front_str, f"{' '*offset}...", end_str]) attrs_str = format_attrs(self) # oneliner only if smaller than display_width full_repr_str = f"{klass_name}([{datastr}], {attrs_str})" if len(full_repr_str) > display_width: # if attrs_str too long, one per line if len(attrs_str) >= display_width - offset: attrs_str = attrs_str.replace(",", f",\n{' '*(offset-2)}") full_repr_str = f"{klass_name}([{datastr}],\n{' '*(offset-1)}{attrs_str})" return full_repr_str def _partial_date_slice(self, resolution, parsed): """Adapted from pandas.tseries.index.DatetimeIndex._partial_date_slice Note that when using a CFTimeIndex, if a partial-date selection returns a single element, it will never be converted to a scalar coordinate; this is in slight contrast to the behavior when using a DatetimeIndex, which sometimes will return a DataArray with a scalar coordinate depending on the resolution of the datetimes used in defining the index. For example: >>> from cftime import DatetimeNoLeap >>> da = xr.DataArray( ... [1, 2], ... coords=[[DatetimeNoLeap(2001, 1, 1), DatetimeNoLeap(2001, 2, 1)]], ... dims=["time"], ... ) >>> da.sel(time="2001-01-01") Size: 8B array([1]) Coordinates: * time (time) object 8B 2001-01-01 00:00:00 >>> da = xr.DataArray( ... [1, 2], ... coords=[[pd.Timestamp(2001, 1, 1), pd.Timestamp(2001, 2, 1)]], ... dims=["time"], ... ) >>> da.sel(time="2001-01-01") Size: 8B array(1) Coordinates: time datetime64[ns] 8B 2001-01-01 >>> da = xr.DataArray( ... [1, 2], ... coords=[[pd.Timestamp(2001, 1, 1, 1), pd.Timestamp(2001, 2, 1)]], ... dims=["time"], ... ) >>> da.sel(time="2001-01-01") Size: 8B array([1]) Coordinates: * time (time) datetime64[ns] 8B 2001-01-01T01:00:00 """ start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed) times = self._data if self.is_monotonic_increasing: if len(times) and ( (start < times[0] and end < times[0]) or (start > times[-1] and end > times[-1]) ): # we are out of range raise KeyError # a monotonic (sorted) series can be sliced left = times.searchsorted(start, side="left") right = times.searchsorted(end, side="right") return slice(left, right) lhs_mask = times >= start rhs_mask = times <= end return np.flatnonzero(lhs_mask & rhs_mask) def _get_string_slice(self, key): """Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice""" parsed, resolution = _parse_iso8601(self.date_type, key) try: loc = self._partial_date_slice(resolution, parsed) except KeyError as err: raise KeyError(key) from err return loc def _get_nearest_indexer(self, target, limit, tolerance): """Adapted from pandas.Index._get_nearest_indexer""" left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) left_distances = abs(self.values[left_indexer] - target.values) right_distances = abs(self.values[right_indexer] - target.values) if self.is_monotonic_increasing: condition = (left_distances < right_distances) | (right_indexer == -1) else: condition = (left_distances <= right_distances) | (right_indexer == -1) indexer = np.where(condition, left_indexer, right_indexer) if tolerance is not None: indexer = self._filter_indexer_tolerance(target, indexer, tolerance) return indexer def _filter_indexer_tolerance(self, target, indexer, tolerance): """Adapted from pandas.Index._filter_indexer_tolerance""" if isinstance(target, pd.Index): distance = abs(self.values[indexer] - target.values) else: distance = abs(self.values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer def get_loc(self, key): """Adapted from pandas.tseries.index.DatetimeIndex.get_loc""" if isinstance(key, str): return self._get_string_slice(key) else: return super().get_loc(key) def _maybe_cast_slice_bound(self, label, side): """Adapted from pandas.tseries.index.DatetimeIndex._maybe_cast_slice_bound """ if not isinstance(label, str): return label parsed, resolution = _parse_iso8601(self.date_type, label) start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed) if self.is_monotonic_decreasing and len(self) > 1: return end if side == "left" else start return start if side == "left" else end # TODO: Add ability to use integer range outside of iloc? # e.g. series[1:5]. def get_value(self, series, key): """Adapted from pandas.tseries.index.DatetimeIndex.get_value""" if np.asarray(key).dtype == np.dtype(bool): return series.iloc[key] elif isinstance(key, slice): return series.iloc[self.slice_indexer(key.start, key.stop, key.step)] else: return series.iloc[self.get_loc(key)] def __contains__(self, key: Any) -> bool: """Adapted from pandas.tseries.base.DatetimeIndexOpsMixin.__contains__""" try: result = self.get_loc(key) return ( is_scalar(result) or isinstance(result, slice) or (isinstance(result, np.ndarray) and result.size > 0) ) except (KeyError, TypeError, ValueError): return False def contains(self, key: Any) -> bool: """Needed for .loc based partial-string indexing""" return self.__contains__(key) def shift( # type: ignore[override] # freq is typed Any, we are more precise self, periods: int | float, freq: str | timedelta | BaseCFTimeOffset | None = None, ) -> Self: """Shift the CFTimeIndex a multiple of the given frequency. See the documentation for :py:func:`~xarray.cftime_range` for a complete listing of valid frequency strings. Parameters ---------- periods : int, float if freq of days or below Periods to shift by freq : str, datetime.timedelta or BaseCFTimeOffset A frequency string or datetime.timedelta object to shift by Returns ------- CFTimeIndex See Also -------- pandas.DatetimeIndex.shift Examples -------- >>> index = xr.cftime_range("2000", periods=1, freq="ME") >>> index CFTimeIndex([2000-01-31 00:00:00], dtype='object', length=1, calendar='standard', freq=None) >>> index.shift(1, "ME") CFTimeIndex([2000-02-29 00:00:00], dtype='object', length=1, calendar='standard', freq=None) >>> index.shift(1.5, "D") CFTimeIndex([2000-02-01 12:00:00], dtype='object', length=1, calendar='standard', freq=None) """ from xarray.coding.cftime_offsets import BaseCFTimeOffset if freq is None: # None type is required to be compatible with base pd.Index class raise TypeError( f"`freq` argument cannot be None for {type(self).__name__}.shift" ) if isinstance(freq, timedelta): return self + periods * freq if isinstance(freq, str | BaseCFTimeOffset): from xarray.coding.cftime_offsets import to_offset return self + periods * to_offset(freq) raise TypeError( f"'freq' must be of type str or datetime.timedelta, got {type(freq)}." ) def __add__(self, other) -> Self: if isinstance(other, pd.TimedeltaIndex): other = other.to_pytimedelta() return type(self)(np.array(self) + other) def __radd__(self, other) -> Self: if isinstance(other, pd.TimedeltaIndex): other = other.to_pytimedelta() return type(self)(other + np.array(self)) def __sub__(self, other): if _contains_datetime_timedeltas(other): return type(self)(np.array(self) - other) if isinstance(other, pd.TimedeltaIndex): return type(self)(np.array(self) - other.to_pytimedelta()) if _contains_cftime_datetimes(np.array(other)): try: return pd.TimedeltaIndex(np.array(self) - np.array(other)) except OUT_OF_BOUNDS_TIMEDELTA_ERRORS as err: raise ValueError( "The time difference exceeds the range of values " "that can be expressed at the nanosecond resolution." ) from err return NotImplemented def __rsub__(self, other): try: return pd.TimedeltaIndex(other - np.array(self)) except OUT_OF_BOUNDS_TIMEDELTA_ERRORS as err: raise ValueError( "The time difference exceeds the range of values " "that can be expressed at the nanosecond resolution." ) from err def to_datetimeindex( self, unsafe: bool = False, time_unit: Optional[PDDatetimeUnitOptions] = None ) -> pd.DatetimeIndex: """If possible, convert this index to a pandas.DatetimeIndex. Parameters ---------- unsafe : bool Flag to turn off calendar mismatch warnings (default ``False``). time_unit : str Time resolution of resulting DatetimeIndex. Can be one of `"s"`, ``"ms"``, ``"us"``, or ``"ns"`` (default ``"ns"``). Returns ------- pandas.DatetimeIndex Raises ------ ValueError If the CFTimeIndex contains dates that are not possible in the standard calendar or outside the range representable by the specified ``time_unit``. Warns ----- RuntimeWarning If converting from a non-standard calendar, or a Gregorian calendar with dates prior to the reform (1582-10-15). Warnings -------- Note that for non-proleptic Gregorian calendars, this will change the calendar type of the index. In that case the result of this method should be used with caution. Examples -------- >>> times = xr.cftime_range("2000", periods=2, calendar="gregorian") >>> times CFTimeIndex([2000-01-01 00:00:00, 2000-01-02 00:00:00], dtype='object', length=2, calendar='standard', freq=None) >>> times.to_datetimeindex(time_unit="ns") DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]', freq=None) """ if not self._data.size: return pd.DatetimeIndex([]) if time_unit is None: emit_user_level_warning( "In a future version of xarray to_datetimeindex will default " "to returning a 'us'-resolution DatetimeIndex instead of a " "'ns'-resolution DatetimeIndex. This warning can be silenced " "by explicitly passing the `time_unit` keyword argument.", FutureWarning, ) time_unit = "ns" nptimes = cftime_to_nptime(self, time_unit=time_unit) calendar = infer_calendar_name(self) if calendar not in _STANDARD_CALENDARS and not unsafe: emit_user_level_warning( "Converting a CFTimeIndex with dates from a non-standard " f"calendar, {calendar!r}, to a pandas.DatetimeIndex, which " "uses dates from the standard calendar. This may lead to " "subtle errors in operations that depend on the length of " "time between dates.", RuntimeWarning, ) if calendar == "standard" and not unsafe: reform_date = self.date_type(1582, 10, 15) if self.min() < reform_date: emit_user_level_warning( "Converting a CFTimeIndex with dates from a Gregorian " "calendar that fall before the reform date of 1582-10-15 " "to a pandas.DatetimeIndex. During this time period the " "Gregorian calendar and the proleptic Gregorian calendar " "of the DatetimeIndex do not exactly align. This warning " "can be silenced by setting unsafe=True.", RuntimeWarning, ) return pd.DatetimeIndex(nptimes) def strftime(self, date_format): """ Return an Index of formatted strings specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in `python string format doc `__ Parameters ---------- date_format : str Date format string (e.g. "%Y-%m-%d") Returns ------- pandas.Index Index of formatted strings Examples -------- >>> rng = xr.cftime_range( ... start="2000", periods=5, freq="2MS", calendar="noleap" ... ) >>> rng.strftime("%B %d, %Y, %r") Index(['January 01, 2000, 12:00:00 AM', 'March 01, 2000, 12:00:00 AM', 'May 01, 2000, 12:00:00 AM', 'July 01, 2000, 12:00:00 AM', 'September 01, 2000, 12:00:00 AM'], dtype='object') """ return pd.Index([date.strftime(date_format) for date in self._data]) @property def asi8(self): """Convert to integers with units of microseconds since 1970-01-01.""" from xarray.core.resample_cftime import exact_cftime_datetime_difference if not self._data.size: return np.array([], dtype=np.int64) epoch = self.date_type(1970, 1, 1) return np.array( [ _total_microseconds(exact_cftime_datetime_difference(epoch, date)) for date in self.values ], dtype=np.int64, ) @property def calendar(self): """The calendar used by the datetimes in the index.""" from xarray.coding.times import infer_calendar_name if not self._data.size: return None return infer_calendar_name(self) @property def freq(self): """The frequency used by the dates in the index.""" from xarray.coding.frequencies import infer_freq # min 3 elemtents required to determine freq if self._data.size < 3: return None return infer_freq(self) def _round_via_method(self, freq, method): """Round dates using a specified method.""" from xarray.coding.cftime_offsets import CFTIME_TICKS, to_offset if not self._data.size: return CFTimeIndex(np.array(self)) offset = to_offset(freq) if not isinstance(offset, CFTIME_TICKS): raise ValueError(f"{offset} is a non-fixed frequency") unit = _total_microseconds(offset.as_timedelta()) values = self.asi8 rounded = method(values, unit) return _cftimeindex_from_i8(rounded, self.date_type, self.name) def floor(self, freq): """Round dates down to fixed frequency. Parameters ---------- freq : str The frequency level to round the index to. Must be a fixed frequency like 'S' (second) not 'ME' (month end). See `frequency aliases `_ for a list of possible values. Returns ------- CFTimeIndex """ return self._round_via_method(freq, _floor_int) def ceil(self, freq): """Round dates up to fixed frequency. Parameters ---------- freq : str The frequency level to round the index to. Must be a fixed frequency like 'S' (second) not 'ME' (month end). See `frequency aliases `_ for a list of possible values. Returns ------- CFTimeIndex """ return self._round_via_method(freq, _ceil_int) def round(self, freq): """Round dates to a fixed frequency. Parameters ---------- freq : str The frequency level to round the index to. Must be a fixed frequency like 'S' (second) not 'ME' (month end). See `frequency aliases `_ for a list of possible values. Returns ------- CFTimeIndex """ return self._round_via_method(freq, _round_to_nearest_half_even) @property def is_leap_year(self): if TYPE_CHECKING: import cftime else: cftime = attempt_import("cftime") func = np.vectorize(cftime.is_leap_year) return func(self.year, calendar=self.calendar) def _parse_array_of_cftime_strings(strings, date_type): """Create a numpy array from an array of strings. For use in generating dates from strings for use with interp. Assumes the array is either 0-dimensional or 1-dimensional. Parameters ---------- strings : array of strings Strings to convert to dates date_type : cftime.datetime type Calendar type to use for dates Returns ------- np.array """ return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape( strings.shape ) def _contains_datetime_timedeltas(array): """Check if an input array contains datetime.timedelta objects.""" array = np.atleast_1d(array) return isinstance(array[0], timedelta) def _cftimeindex_from_i8(values, date_type, name): """Construct a CFTimeIndex from an array of integers. Parameters ---------- values : np.array Integers representing microseconds since 1970-01-01. date_type : cftime.datetime Type of date for the index. name : str Name of the index. Returns ------- CFTimeIndex """ epoch = date_type(1970, 1, 1) dates = np.array([epoch + timedelta(microseconds=int(value)) for value in values]) return CFTimeIndex(dates, name=name) def _total_microseconds(delta): """Compute the total number of microseconds of a datetime.timedelta. Parameters ---------- delta : datetime.timedelta Input timedelta. Returns ------- int """ return delta / timedelta(microseconds=1) def _floor_int(values, unit): """Copied from pandas.""" return values - np.remainder(values, unit) def _ceil_int(values, unit): """Copied from pandas.""" return values + np.remainder(-values, unit) def _round_to_nearest_half_even(values, unit): """Copied from pandas.""" if unit % 2: return _ceil_int(values - unit // 2, unit) quotient, remainder = np.divmod(values, unit) mask = np.logical_or( remainder > (unit // 2), np.logical_and(remainder == (unit // 2), quotient % 2) ) quotient[mask] += 1 return quotient * unit