# CCR/.venv/lib/python3.12/site-packages/xarray/backends/netcdf3.py
#
# 172 lines
# 5.6 KiB
# Python

from __future__ import annotations
import unicodedata
import numpy as np
from xarray import coding
from xarray.core.variable import Variable
# Special (non-alphanumeric) ASCII characters that are permitted in netCDF
# names. Apart from "_", none of them may appear in the 0th position of the
# string (see ``is_valid_nc3_name``). Note that "/" is not in this set.
_specialchars = '_.@+- !"#$%&\\()*,:;<=>?[]^`{|}~'
# Names reserved by CDL. They may not be used as the name of a variable,
# dimension, or attribute; ``is_valid_nc3_name`` rejects them.
_reserved_names = {
    "byte",
    "char",
    "short",
    "ushort",
    "int",
    "uint",
    "int64",
    "uint64",
    "float",
    "real",
    "double",
    "bool",
    "string",
}
# netCDF3 cannot store these data-types. ``coerce_nc3_dtype`` looks up the
# string form of an array's dtype here and casts to the mapped dtype instead.
_nc3_dtype_coercions = {
    "int64": "int32",
    "uint64": "int32",
    "uint32": "int32",
    "uint16": "int16",
    "uint8": "int8",
    "bool": "int8",
}
# Text encoding used when ``str`` attribute values are converted to bytes
# (see ``encode_nc3_attr_value``): all strings are encoded as UTF-8.
STRING_ENCODING = "utf-8"
# Message template for the ValueError raised by ``coerce_nc3_dtype`` when a
# cast does not round-trip. It is filled in with ``str.format`` and expects
# the keyword fields ``dtype`` (original) and ``new_dtype`` (cast target).
COERCION_VALUE_ERROR = (
    "could not safely cast array from {dtype} to {new_dtype}. While it is not "
    "always the case, a common reason for this is that xarray has deemed it "
    "safest to encode np.datetime64[ns] or np.timedelta64[ns] values with "
    "int64 values representing units of 'nanoseconds'. This is either due to "
    "the fact that the times are known to require nanosecond precision for an "
    "accurate round trip, or that the times are unknown prior to writing due "
    "to being contained in a chunked array. Ways to work around this are "
    "either to use a backend that supports writing int64 values, or to "
    "manually specify the encoding['units'] and encoding['dtype'] (e.g. "
    "'seconds since 1970-01-01' and np.dtype('int32')) on the time "
    "variable(s) such that the times can be serialized in a netCDF3 file "
    "(note that depending on the situation, however, this latter option may "
    "result in an inaccurate round trip)."
)
def coerce_nc3_dtype(arr):
    """Cast an array to a dtype that can be stored in a netCDF-3 file.

    The string form of ``arr.dtype`` is looked up in
    ``_nc3_dtype_coercions``, which specifies these conversions:

        int64  -> int32
        uint64 -> int32
        uint32 -> int32
        uint16 -> int16
        uint8  -> int8
        bool   -> int8

    If the dtype is not listed there, ``arr`` is returned unchanged.
    Otherwise the array is cast and the result is verified with
    ``(cast_array == original_array).all()``.

    Raises
    ------
    ValueError
        If any element of the cast array does not compare equal to the
        corresponding original element. The message is built from
        ``COERCION_VALUE_ERROR``.
    """
    source_dtype = str(arr.dtype)
    target_dtype = _nc3_dtype_coercions.get(source_dtype)
    if target_dtype is None:
        # No coercion is registered for this dtype.
        return arr

    # TODO: raise a warning whenever casting the data-type instead?
    converted = arr.astype(target_dtype)
    if (converted == arr).all():
        return converted
    raise ValueError(
        COERCION_VALUE_ERROR.format(dtype=source_dtype, new_dtype=target_dtype)
    )
def encode_nc3_attr_value(value):
    """Encode a single attribute value for storage in a netCDF-3 file.

    * ``bytes`` are returned unchanged.
    * ``str`` is encoded to bytes using ``STRING_ENCODING``.
    * Anything else is made at least 1-dimensional with ``np.atleast_1d``
      and passed through ``coerce_nc3_dtype``.

    Raises
    ------
    ValueError
        If a non-string value has more than one dimension.
    """
    if isinstance(value, bytes):
        return value
    if isinstance(value, str):
        return value.encode(STRING_ENCODING)

    encoded = coerce_nc3_dtype(np.atleast_1d(value))
    if encoded.ndim > 1:
        raise ValueError("netCDF attributes must be 1-dimensional")
    return encoded
def encode_nc3_attrs(attrs):
    """Return a new dict with every value of ``attrs`` encoded for netCDF-3.

    Keys are kept as-is; each value is passed through
    ``encode_nc3_attr_value``.
    """
    return {
        name: encode_nc3_attr_value(raw_value)
        for name, raw_value in attrs.items()
    }
def _maybe_prepare_times(var):
    """Return ``var.data`` with the int64 minimum sentinel replaced, if needed.

    Only integer-typed data (dtype kind ``"i"`` or ``"u"``) whose ``units``
    attribute is time-like is touched. In that case every element equal to
    ``np.iinfo(np.int64).min`` is replaced with the ``_FillValue`` attribute,
    or with ``np.nan`` when no ``_FillValue`` is set. This keeps backwards
    compatibility. In every other case the data is returned unchanged.
    """
    values = var.data
    if values.dtype.kind not in "iu":
        return values

    units = var.attrs.get("units", None)
    if units is None or not coding.variables._is_time_like(units):
        return values

    sentinel_mask = values == np.iinfo(np.int64).min
    if not sentinel_mask.any():
        return values
    return np.where(sentinel_mask, var.attrs.get("_FillValue", np.nan), values)
def encode_nc3_variable(var):
    """Encode a variable so that it can be written to a netCDF-3 file.

    The variable is first run through ``EncodedStringCoder`` (with
    ``allows_unicode=False``) and then ``CharacterArrayCoder``. Its data is
    then prepared with ``_maybe_prepare_times`` and ``coerce_nc3_dtype``,
    and its attributes are encoded with ``encode_nc3_attrs``. A new
    ``Variable`` is returned with the encoded variable's dims and encoding.
    """
    coders = (
        coding.strings.EncodedStringCoder(allows_unicode=False),
        coding.strings.CharacterArrayCoder(),
    )
    encoded = var
    for coder in coders:
        encoded = coder.encode(encoded)

    data = coerce_nc3_dtype(_maybe_prepare_times(encoded))
    attrs = encode_nc3_attrs(encoded.attrs)
    return Variable(encoded.dims, data, attrs, encoded.encoding)
def _isalnumMUTF8(c):
    """Return True if the character is alphanumeric or multibyte in UTF-8.

    A character counts as multibyte when its UTF-8 encoding is longer than
    one byte. Input is not checked!
    """
    if c.isalnum():
        return True
    utf8_length = len(c.encode("utf-8"))
    return utf8_length > 1
def is_valid_nc3_name(s):
    """Test whether an object can be validly converted to a netCDF-3
    dimension, variable or attribute name.

    Earlier versions of the netCDF C-library reference implementation
    enforced a more restricted set of characters in creating new names,
    but permitted reading names containing arbitrary bytes. This
    specification extends the permitted characters in names to include
    multi-byte UTF-8 encoded Unicode and additional printing characters
    from the US-ASCII alphabet. The first character of a name must be
    alphanumeric, a multi-byte UTF-8 character, or '_' (reserved for
    special names with meaning to implementations, such as the
    "_FillValue" attribute). Subsequent characters may also include
    printing special characters, except for '/' which is not allowed in
    names. Names that have trailing space characters are also not
    permitted.

    Parameters
    ----------
    s : object
        Candidate name. Anything that is not a ``str`` is invalid.

    Returns
    -------
    bool
        True if ``s`` is a non-empty, NFC-normalized string that is not a
        reserved name, contains no '/', does not end with a space, starts
        with an alphanumeric/multi-byte character or '_', and otherwise
        consists only of alphanumeric/multi-byte characters and the
        characters in ``_specialchars``. False otherwise.
    """
    # Reject non-strings and the empty string up front. A name needs at
    # least one character, and the checks below index s[0] and s[-1], which
    # would raise IndexError on an empty string.
    if not isinstance(s, str) or not s:
        return False
    return (
        (unicodedata.normalize("NFC", s) == s)
        and (s not in _reserved_names)
        and ("/" not in s)
        and (s[-1] != " ")
        and (_isalnumMUTF8(s[0]) or (s[0] == "_"))
        and all(_isalnumMUTF8(c) or c in _specialchars for c in s)
    )