172 lines
5.6 KiB
Python
172 lines
5.6 KiB
Python
from __future__ import annotations
|
|
|
|
import unicodedata
|
|
|
|
import numpy as np
|
|
|
|
from xarray import coding
|
|
from xarray.core.variable import Variable
|
|
|
|
# Special characters that are permitted in netCDF names except in the
|
|
# 0th position of the string
|
|
_specialchars = '_.@+- !"#$%&\\()*,:;<=>?[]^`{|}~'
|
|
|
|
# The following are reserved names in CDL and may not be used as names of
|
|
# variables, dimension, attributes
|
|
_reserved_names = {
|
|
"byte",
|
|
"char",
|
|
"short",
|
|
"ushort",
|
|
"int",
|
|
"uint",
|
|
"int64",
|
|
"uint64",
|
|
"float",
|
|
"real",
|
|
"double",
|
|
"bool",
|
|
"string",
|
|
}
|
|
|
|
# These data-types aren't supported by netCDF3, so they are automatically
|
|
# coerced instead as indicated by the "coerce_nc3_dtype" function
|
|
_nc3_dtype_coercions = {
|
|
"int64": "int32",
|
|
"uint64": "int32",
|
|
"uint32": "int32",
|
|
"uint16": "int16",
|
|
"uint8": "int8",
|
|
"bool": "int8",
|
|
}
|
|
|
|
# encode all strings as UTF-8
|
|
STRING_ENCODING = "utf-8"
|
|
COERCION_VALUE_ERROR = (
|
|
"could not safely cast array from {dtype} to {new_dtype}. While it is not "
|
|
"always the case, a common reason for this is that xarray has deemed it "
|
|
"safest to encode np.datetime64[ns] or np.timedelta64[ns] values with "
|
|
"int64 values representing units of 'nanoseconds'. This is either due to "
|
|
"the fact that the times are known to require nanosecond precision for an "
|
|
"accurate round trip, or that the times are unknown prior to writing due "
|
|
"to being contained in a chunked array. Ways to work around this are "
|
|
"either to use a backend that supports writing int64 values, or to "
|
|
"manually specify the encoding['units'] and encoding['dtype'] (e.g. "
|
|
"'seconds since 1970-01-01' and np.dtype('int32')) on the time "
|
|
"variable(s) such that the times can be serialized in a netCDF3 file "
|
|
"(note that depending on the situation, however, this latter option may "
|
|
"result in an inaccurate round trip)."
|
|
)
|
|
|
|
|
|
def coerce_nc3_dtype(arr):
|
|
"""Coerce an array to a data type that can be stored in a netCDF-3 file
|
|
|
|
This function performs the dtype conversions as specified by the
|
|
``_nc3_dtype_coercions`` mapping:
|
|
int64 -> int32
|
|
uint64 -> int32
|
|
uint32 -> int32
|
|
uint16 -> int16
|
|
uint8 -> int8
|
|
bool -> int8
|
|
|
|
Data is checked for equality, or equivalence (non-NaN values) using the
|
|
``(cast_array == original_array).all()``.
|
|
"""
|
|
dtype = str(arr.dtype)
|
|
if dtype in _nc3_dtype_coercions:
|
|
new_dtype = _nc3_dtype_coercions[dtype]
|
|
# TODO: raise a warning whenever casting the data-type instead?
|
|
cast_arr = arr.astype(new_dtype)
|
|
if not (cast_arr == arr).all():
|
|
raise ValueError(
|
|
COERCION_VALUE_ERROR.format(dtype=dtype, new_dtype=new_dtype)
|
|
)
|
|
arr = cast_arr
|
|
return arr
|
|
|
|
|
|
def encode_nc3_attr_value(value):
|
|
if isinstance(value, bytes):
|
|
pass
|
|
elif isinstance(value, str):
|
|
value = value.encode(STRING_ENCODING)
|
|
else:
|
|
value = coerce_nc3_dtype(np.atleast_1d(value))
|
|
if value.ndim > 1:
|
|
raise ValueError("netCDF attributes must be 1-dimensional")
|
|
return value
|
|
|
|
|
|
def encode_nc3_attrs(attrs):
|
|
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}
|
|
|
|
|
|
def _maybe_prepare_times(var):
|
|
# checks for integer-based time-like and
|
|
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
|
|
# this keeps backwards compatibility
|
|
|
|
data = var.data
|
|
if data.dtype.kind in "iu":
|
|
units = var.attrs.get("units", None)
|
|
if units is not None:
|
|
if coding.variables._is_time_like(units):
|
|
mask = data == np.iinfo(np.int64).min
|
|
if mask.any():
|
|
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
|
|
return data
|
|
|
|
|
|
def encode_nc3_variable(var):
|
|
for coder in [
|
|
coding.strings.EncodedStringCoder(allows_unicode=False),
|
|
coding.strings.CharacterArrayCoder(),
|
|
]:
|
|
var = coder.encode(var)
|
|
data = _maybe_prepare_times(var)
|
|
data = coerce_nc3_dtype(data)
|
|
attrs = encode_nc3_attrs(var.attrs)
|
|
return Variable(var.dims, data, attrs, var.encoding)
|
|
|
|
|
|
def _isalnumMUTF8(c):
|
|
"""Return True if the given UTF-8 encoded character is alphanumeric
|
|
or multibyte.
|
|
|
|
Input is not checked!
|
|
"""
|
|
return c.isalnum() or (len(c.encode("utf-8")) > 1)
|
|
|
|
|
|
def is_valid_nc3_name(s):
|
|
"""Test whether an object can be validly converted to a netCDF-3
|
|
dimension, variable or attribute name
|
|
|
|
Earlier versions of the netCDF C-library reference implementation
|
|
enforced a more restricted set of characters in creating new names,
|
|
but permitted reading names containing arbitrary bytes. This
|
|
specification extends the permitted characters in names to include
|
|
multi-byte UTF-8 encoded Unicode and additional printing characters
|
|
from the US-ASCII alphabet. The first character of a name must be
|
|
alphanumeric, a multi-byte UTF-8 character, or '_' (reserved for
|
|
special names with meaning to implementations, such as the
|
|
"_FillValue" attribute). Subsequent characters may also include
|
|
printing special characters, except for '/' which is not allowed in
|
|
names. Names that have trailing space characters are also not
|
|
permitted.
|
|
"""
|
|
if not isinstance(s, str):
|
|
return False
|
|
num_bytes = len(s.encode("utf-8"))
|
|
return (
|
|
(unicodedata.normalize("NFC", s) == s)
|
|
and (s not in _reserved_names)
|
|
and (num_bytes >= 0)
|
|
and ("/" not in s)
|
|
and (s[-1] != " ")
|
|
and (_isalnumMUTF8(s[0]) or (s[0] == "_"))
|
|
and all(_isalnumMUTF8(c) or c in _specialchars for c in s)
|
|
)
|