CCR/.venv/lib/python3.12/site-packages/pysd/py_backend/data.py

373 lines
12 KiB
Python

import warnings
import re
import random
from pathlib import Path
import numpy as np
import xarray as xr
import pandas as pd
from .utils import load_outputs
class Columns():
    """
    Class to cache the columns read in data files.

    Column lists are cached per file in ``_files`` so repeated lookups
    on the same output file do not re-read it from disk.
    """
    # Cache: pathlib.Path -> (columns, transposed) as returned by read_file
    _files = {}

    @classmethod
    def read(cls, file_name, encoding=None):
        """
        Read the columns from the data file or return the previously
        read ones.

        Parameters
        ----------
        file_name: str or pathlib.Path
            Output file to read. Must be csv or tab.
        encoding: str or None (optional)
            Encoding type to read output file. Needed if the file has
            special characters. Default is None.

        Returns
        -------
        out, transposed: list, bool
            The list of the columns in the output file and a boolean
            flag to indicate if the output file is transposed.

        """
        file_name = Path(file_name)
        if file_name in cls._files:
            return cls._files[file_name]
        columns = cls.read_file(file_name, encoding)
        cls._files[file_name] = columns
        return columns

    @classmethod
    def read_file(cls, file_name, encoding=None):
        """
        Get the columns from an output csv or tab file.

        Parameters
        ----------
        file_name: pathlib.Path
            Output file to read. Must be csv or tab.
        encoding: str or None (optional)
            Encoding type to read output file. Needed if the file has
            special characters. Default is None.

        Returns
        -------
        out, transposed: list, bool
            The list of the columns in the output file and a boolean flag
            to indicate if the output file is transposed.

        Raises
        ------
        ValueError
            If the file extension is not '.csv' or '.tab', or if
            variable names cannot be found in the first row nor in the
            first column.

        """
        # in the most cases variables will be split per columns, then
        # read the first row to have all the column names
        out = cls.read_line(file_name, encoding)
        if out is None:
            raise ValueError(
                f"\nNot able to read '{str(file_name)}'. "
                + "Only '.csv', '.tab' files are accepted.")

        transpose = False
        try:
            # if we fail converting a sample of columns to float then
            # they are not numeric values, so current direction is okay
            [float(col) for col in random.sample(out, min(3, len(out)))]
            # we did not fail, read the first column to see if variables
            # are split per rows
            out = cls.read_col(file_name, encoding)
            transpose = True
            # if we still are able to transform values to float the
            # file is not valid
            [float(col) for col in random.sample(out, min(3, len(out)))]
        except ValueError:
            return out, transpose
        else:
            raise ValueError(
                f"Invalid file format '{str(file_name)}'... variable names "
                "should appear in the first row or in the first column...")

    @classmethod
    def read_line(cls, file_name, encoding=None):
        """
        Read the first row and return its values as a list, skipping
        the first column (the time index). Returns None for
        unsupported file extensions.
        """
        if file_name.suffix.lower() == ".tab":
            return list(pd.read_table(file_name,
                                      nrows=0,
                                      encoding=encoding,
                                      dtype=str,
                                      header=0).iloc[:, 1:])
        elif file_name.suffix.lower() == ".csv":
            return list(pd.read_csv(file_name,
                                    nrows=0,
                                    encoding=encoding,
                                    dtype=str,
                                    header=0).iloc[:, 1:])
        else:
            return None

    @classmethod
    def read_col(cls, file_name, encoding=None):
        """
        Read the first column and return its values as a list (the
        first row is taken as a header and therefore excluded).
        """
        if file_name.suffix.lower() == ".tab":
            return list(pd.read_table(file_name,
                                      usecols=[0],
                                      encoding=encoding,
                                      dtype=str).iloc[:, 0].to_list())
        elif file_name.suffix.lower() == ".csv":
            return list(pd.read_csv(file_name,
                                    usecols=[0],
                                    encoding=encoding,
                                    dtype=str).iloc[:, 0].to_list())

    @classmethod
    def get_columns(cls, file_name, vars=None, encoding=None):
        """
        Get columns names from a tab or csv file and return those that
        match with the given ones.

        Parameters
        ----------
        file_name: str
            Output file to read. Must be csv or tab.
        vars: list
            List of var names to find in the file.
        encoding: str or None (optional)
            Encoding type to read output file. Needed if the file has special
            characters. Default is None.

        Return
        ------
        columns, transpose: set, bool
            The set of columns as they are named in the input file and a
            boolean flag to indicate if the input file is transposed or
            not.

        """
        if vars is None:
            # No var specified, return all available variables
            return cls.read(file_name, encoding)

        columns, transpose = cls.read(file_name, encoding)

        vars_extended = []
        for var in vars:
            vars_extended.append(var)
            if var.startswith('"') and var.endswith('"'):
                # the variables in "" are read without " by pandas
                vars_extended.append(var[1:-1])

        outs = set()
        for var in columns:
            if var in vars_extended:
                # var is in vars_extended (no subscripts)
                outs.add(var)
                # avoid matching the same name twice
                vars_extended.remove(var)
            else:
                for var1 in vars_extended:
                    if var.startswith(var1 + "["):
                        # var is subscripted
                        outs.add(var)

        return outs, transpose

    @classmethod
    def clean(cls):
        """
        Clean the dictionary of read files
        """
        cls._files = {}
class Data(object):
    """
    Base class for time-series data backing a model variable.

    Instances are callable: ``obj(time)`` returns the value of the data
    at the given time, using the interpolation mode in ``self.interp``.
    Subclasses (e.g. TabData below) are expected to set ``py_name``,
    ``final_coords``, ``interp``, ``is_float``, ``nan`` and ``data``
    before the instance is called.
    """
    # TODO add __init__ and use this class for used input pandas.Series
    # as Data
    # def __init__(self, data, coords, interp="interpolate"):

    def set_values(self, values):
        """
        Set new values from user input.

        Parameters
        ----------
        values: pandas.Series, xarray.DataArray or scalar
            New values for the data. A pandas.Series is treated as
            time-indexed values (its index becomes the 'time'
            dimension); other inputs replace the data with no time
            dimension.
        """
        # start from an all-NaN array over the variable's coordinates
        self.data = xr.DataArray(
            np.nan, self.final_coords, list(self.final_coords))
        if isinstance(values, pd.Series):
            index = list(values.index)
            index.sort()
            # prepend a 'time' dimension holding the sorted index
            self.data = self.data.expand_dims(
                {'time': index}, axis=0).copy()
            for index, value in values.items():
                # NOTE(review): the type check looks at values.values[0]
                # (the first element) on every iteration — presumably all
                # elements of the series share one type; confirm callers
                if isinstance(values.values[0], xr.DataArray):
                    # NOTE(review): chained .loc assignment relies on the
                    # first .loc returning a view here — TODO confirm
                    self.data.loc[index].loc[value.coords] = value
                else:
                    self.data.loc[index] = value
        else:
            if isinstance(values, xr.DataArray):
                # assign only the subset of coords covered by `values`
                self.data.loc[values.coords] = values.values
            else:
                if self.final_coords:
                    # broadcast the scalar over all coordinates
                    self.data.loc[:] = values
                else:
                    # scalar variable: store the plain value
                    self.data = values

    def __call__(self, time):
        """
        Return the data value at `time`, interpolating/extrapolating
        according to ``self.interp``; floats are returned for
        dimensionless data, DataArrays otherwise.
        """
        try:
            if time in self.data['time'].values:
                # exact time match, no interpolation needed
                outdata = self.data.sel(time=time)
            elif self.interp == "raw":
                # raw mode: no value defined between data points
                return self.nan
            elif time > self.data['time'].values[-1]:
                warnings.warn(
                    self.py_name + "\n"
                    + "extrapolating data above the maximum value of the time")
                # hold the last data point
                outdata = self.data[-1]
            elif time < self.data['time'].values[0]:
                warnings.warn(
                    self.py_name + "\n"
                    + "extrapolating data below the minimum value of the time")
                # hold the first data point
                outdata = self.data[0]
            elif self.interp == "interpolate":
                outdata = self.data.interp(time=time)
            elif self.interp == 'look_forward':
                # take the next available data point
                outdata = self.data.sel(time=time, method="backfill")
            elif self.interp == 'hold_backward':
                # hold the previous available data point
                outdata = self.data.sel(time=time, method="pad")
            if self.is_float:
                # if data has no-coords return a float
                return float(outdata)
            else:
                # Remove time coord from the DataArray
                return outdata.reset_coords('time', drop=True)
        except (TypeError, KeyError):
            if self.data is None:
                raise ValueError(
                    self.py_name + "\n"
                    "Trying to interpolate data variable before loading"
                    " the data...")
            # this except catch the errors when a data has been
            # changed to a constant value by the user
            return self.data
        except Exception as err:
            raise err
class TabData(Data):
    """
    Data from tabular file tab/csv, it could be from Vensim output.
    """
    def __init__(self, real_name, py_name, coords, interp="interpolate"):
        self.real_name = real_name
        self.py_name = py_name
        self.coords = coords
        self.final_coords = coords
        # normalize e.g. "look forward" -> "look_forward"
        self.interp = interp.replace(" ", "_") if interp else None
        # dimensionless data is returned as float by Data.__call__
        self.is_float = not bool(coords)
        self.data = None

        if self.interp not in ["interpolate", "raw",
                               "look_forward", "hold_backward"]:
            raise ValueError(self.py_name + "\n"
                             + "The interpolation method (interp) must be "
                             + "'raw', 'interpolate', "
                             + "'look_forward' or 'hold_backward'")

    def load_data(self, file_names):
        """
        Load data values from files. The first file containing the
        variable wins; remaining files are not read.

        Parameters
        ----------
        file_names: list or str or pathlib.Path
            Name of the files to search the variable in.

        Returns
        -------
        out: xarray.DataArray
            Resulting data array with the time in the first dimension.

        Raises
        ------
        ValueError
            If the variable is not found in any of the given files.

        """
        if isinstance(file_names, (str, Path)):
            file_names = [file_names]

        for file_name in file_names:
            self.data = self._load_data(Path(file_name))
            if self.data is not None:
                break

        if self.data is None:
            raise ValueError(
                f"_data_{self.py_name}\n"
                f"Data for {self.real_name} not found in "
                f"{', '.join([str(file_name) for file_name in file_names])}")

    def _load_data(self, file_name):
        """
        Load data values from output.

        Parameters
        ----------
        file_name: pathlib.Path
            Name of the file to search the variable in.

        Returns
        -------
        out: xarray.DataArray or None
            Resulting data array with the time in the first dimension,
            or None if the variable is not found in the file.

        """
        # TODO include missing values management as External objects
        # get columns to load variable
        if file_name.suffix in [".csv", ".tab"]:
            columns, transpose = Columns.get_columns(
                file_name, vars=[self.real_name, self.py_name])

            if not columns:
                # the variable is not in the passed file
                return None

            if not self.coords:
                # 0 dimensional data
                self.nan = np.nan
                values = load_outputs(file_name, transpose, columns=columns)
                return xr.DataArray(
                    values.iloc[:, 0].values,
                    {'time': values.index.values},
                    ['time'])

            # subscripted data
            dims = list(self.coords)
            values = load_outputs(file_name, transpose, columns=columns)
            self.nan = xr.DataArray(np.nan, self.coords, dims)
            out = xr.DataArray(
                np.nan,
                {'time': values.index.values, **self.coords},
                ['time'] + dims)
            for column in values.columns:
                # split "var[sub1, sub2]" into its subscript coordinates
                coords = {
                    dim: [coord]
                    for (dim, coord)
                    in zip(dims, re.split(r'\[|\]|\s*,\s*', column)[1:-1])
                }
                out.loc[coords] = np.expand_dims(
                    values[column].values,
                    axis=tuple(range(1, len(coords)+1))
                )
            return out

        # not a csv/tab file: try to read the variable from a dataset
        # (e.g. netCDF) via xarray
        ds = xr.open_dataset(file_name)
        if self.py_name in ds:
            data = ds[self.py_name]
            ds.close()
            # BUGFIX: the original compared list(...).sort() results,
            # which are always None (list.sort sorts in place and
            # returns None), so the dims check never rejected anything.
            # Compare the sorted dimension names instead.
            if (
                "time" in data.dims
                and sorted(self.coords) == sorted(data.dims[1:])
            ):
                return data
        ds.close()
        return None