# CCR/.venv/lib/python3.12/site-packages/python_calamine/pandas.py
#
# 137 lines
# 4.5 KiB
# Python

from __future__ import annotations
from datetime import date, datetime, time, timedelta
from importlib.metadata import version
from typing import TYPE_CHECKING, Union, cast
import pandas as pd
from packaging.version import Version, parse
from pandas._typing import Scalar
from pandas.compat._optional import import_optional_dependency
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel import ExcelFile
from pandas.io.excel._base import ( # type:ignore[attr-defined] # missing in pandas-stubs
BaseExcelReader,
)
from pandas.util._decorators import ( # type:ignore[attr-defined] # missing in pandas-stubs
doc,
)
if TYPE_CHECKING:
from pandas._typing import FilePath, ReadBuffer, StorageOptions
from python_calamine import CalamineSheet, CalamineWorkbook
# Union of the Python value types used to annotate raw cell values returned by
# ``sheet.to_python`` before they are converted for pandas.
_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta]
# Version of the installed pandas distribution, parsed once at import time.
# ``CalamineExcelReader.__init__`` compares it against release thresholds.
PANDAS_VERSION = parse(version("pandas"))
class CalamineExcelReader(BaseExcelReader):
    """Excel reader engine for pandas backed by ``python_calamine``.

    ``pandas_monkeypatch`` in this module registers the class under the
    ``"calamine"`` engine name.
    """

    book: CalamineWorkbook

    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using calamine engine (xlsx/xls/xlsb/ods).

        Parameters
        ----------
        filepath_or_buffer : str, path to be parsed or
            an open readable stream.
        {storage_options}
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.

        Raises
        ------
        ValueError
            If the installed pandas is older than 2.0.0, or is 2.2.0 or
            newer (which has builtin calamine support).
        """
        import_optional_dependency("python_calamine")

        if PANDAS_VERSION >= Version("2.2.0"):
            raise ValueError("Pandas >= 2.2.0 has builtin support of calamine")
        elif PANDAS_VERSION >= Version("2.1.0"):
            # Only the 2.1.x branch forwards ``engine_kwargs`` to the base
            # reader.
            super().__init__(
                filepath_or_buffer,
                storage_options=storage_options,
                engine_kwargs=engine_kwargs,
            )
        elif PANDAS_VERSION >= Version("2.0.0"):
            # The 2.0.x branch calls the base reader without
            # ``engine_kwargs``.
            super().__init__(
                filepath_or_buffer,
                storage_options=storage_options,
            )
        else:
            raise ValueError("Pandas >= 2 is only supported")

    @property
    def _workbook_class(self) -> type[CalamineWorkbook]:
        """Return the ``CalamineWorkbook`` class (imported lazily)."""
        from python_calamine import CalamineWorkbook

        return CalamineWorkbook

    def load_workbook(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        engine_kwargs: dict | None = None,
    ) -> CalamineWorkbook:
        """Open the workbook with ``python_calamine.load_workbook``.

        ``engine_kwargs`` is expanded into keyword arguments; ``None`` is
        treated as an empty dict.
        """
        from python_calamine import load_workbook

        return load_workbook(filepath_or_buffer, **(engine_kwargs or {}))

    @property
    def sheet_names(self) -> list[str]:
        """Names of the sheets whose type is ``SheetTypeEnum.WorkSheet``."""
        from python_calamine import SheetTypeEnum

        return [
            sheet.name
            for sheet in self.book.sheets_metadata
            if sheet.typ == SheetTypeEnum.WorkSheet
        ]

    def get_sheet_by_name(self, name: str) -> CalamineSheet:
        """Check ``name`` with ``raise_if_bad_sheet_by_name``, then fetch the sheet."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.get_sheet_by_name(name)

    def get_sheet_by_index(self, index: int) -> CalamineSheet:
        """Check ``index`` with ``raise_if_bad_sheet_by_index``, then fetch the sheet."""
        self.raise_if_bad_sheet_by_index(index)
        return self.book.get_sheet_by_index(index)

    def get_sheet_data(
        self, sheet: CalamineSheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar]]:
        """Return the sheet's rows with each cell converted for pandas.

        Parameters
        ----------
        sheet : CalamineSheet
            Sheet whose ``to_python`` output is converted.
        file_rows_needed : int, optional
            Stop once this many rows have been collected. ``None`` reads
            every row.

        Returns
        -------
        list of list of Scalar
            One inner list per row. Floats holding a whole number become
            ``int``, ``date``/``datetime`` become ``pd.Timestamp``,
            ``timedelta`` becomes ``pd.Timedelta``; everything else is
            returned unchanged.
        """

        def _convert_cell(value: _CellValueT) -> Scalar:
            if isinstance(value, float):
                # ``is_integer()`` is False for NaN and +/-inf, so non-finite
                # floats are passed through instead of letting ``int()``
                # raise ValueError/OverflowError and abort the whole read.
                return int(value) if value.is_integer() else value
            if isinstance(value, date):
                # ``datetime`` is a ``date`` subclass, so it lands here too.
                return pd.Timestamp(value)
            if isinstance(value, timedelta):
                return pd.Timedelta(value)
            if isinstance(value, time):
                # cast needed here because Scalar doesn't include datetime.time
                return cast(Scalar, value)
            return value

        # NOTE(review): ``skip_empty_area=False`` presumably keeps leading
        # empty rows/columns so positions match the sheet -- confirm against
        # the python_calamine documentation.
        rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
        data: list[list[Scalar]] = []
        for row in rows:
            data.append([_convert_cell(cell) for cell in row])
            # Checked after the append, so conversion stops as soon as enough
            # rows have been collected.
            if file_rows_needed is not None and len(data) >= file_rows_needed:
                break
        return data
def pandas_monkeypatch() -> None:
    """Register ``CalamineExcelReader`` as the ``"calamine"`` ExcelFile engine.

    ``ExcelFile._engines`` is replaced by a new dict. The ``"calamine"`` key
    is inserted first and the existing engines are merged on top, so an
    already-registered ``"calamine"`` entry keeps precedence over this one.
    """
    engines = {"calamine": CalamineExcelReader}
    engines.update(
        ExcelFile._engines  # type:ignore[attr-defined] # missing in pandas-stubs
    )
    ExcelFile._engines = engines  # type:ignore[attr-defined] # missing in pandas-stubs