feat: support mixed indices and column names for dtypes and column selection (#206)

* refactor: moved excelsheet to a subdirectory

  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat: support mixed indices and column names for dtypes and column selection

  Work items:

  * Introduced a new type providing information about a column: `ColumnInfo`
  * `ExcelSheet.selected_columns` now returns a list of ColumnInfo for all selected columns
  * `ExcelSheet.available_columns` now returns a list of ColumnInfo for all available columns
  * A lot of to/from python code has been wrapped in traits to have less manual validation

  closes #198

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
ToucanToco · Mar 25, 2024 · f3c92d6 · f3c92d6
1 parent a4a7f3e
commit f3c92d6
Show file tree

Hide file tree

Showing 13 changed files with 848 additions and 614 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -18,6 +18,7 @@
     CalamineCellError,
     CalamineError,
     CannotRetrieveCellDataError,
+    ColumnInfo,
     ColumnNotFoundError,
     FastExcelError,
     InvalidParametersError,
@@ -30,7 +31,9 @@
 from ._fastexcel import read_excel as _read_excel
 
 DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
-DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]"
+DTypeMap: TypeAlias = "dict[str | int, DType]"
+ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
+DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"]
 
 
 class ExcelSheet:
@@ -60,12 +63,12 @@ def total_height(self) -> int:
         return self._sheet.total_height
 
     @property
-    def selected_columns(self) -> list[str] | list[int] | None:
+    def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
         return self._sheet.selected_columns
 
     @property
-    def available_columns(self) -> list[str]:
+    def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
         return self._sheet.available_columns
 
@@ -141,13 +144,12 @@ def load_sheet(
                                    If `None`, all rows will be used.
         :param use_columns: Specifies the columns to use. Can either be:
                             - `None` to select all columns
-                            - a list of strings, the column names
-                            - a list of ints, the column indices (starting at 0)
-                            - a string, a comma separated list of Excel column letters and column
+                            - A list of strings and ints, the column names and/or indices
+                              (starting at 0)
+                            - A string, a comma separated list of Excel column letters and column
                               ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
                               `A,B,C,D,E` and `A,C,E,F`)
-        :param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
-                       is a list of ints or an Excel range), or column names
+        :param dtypes: An optional dict of dtypes. Keys can be column indices or names
         """
         return ExcelSheet(
             self._reader.load_sheet(
@@ -235,10 +237,22 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:
 
 
 __all__ = (
+    ## version
     "__version__",
+    ## main entrypoint
     "read_excel",
+    ## Python types
+    "DType",
+    "DTypeMap",
+    # Excel reader
     "ExcelReader",
+    # Excel sheet
     "ExcelSheet",
+    # Column metadata
+    "DTypeFrom",
+    "ColumnNameFrom",
+    "ColumnInfo",
+    # Exceptions
     "FastExcelError",
     "CannotRetrieveCellDataError",
     "CalamineCellError",

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -4,9 +4,31 @@ from typing import Literal
 
 import pyarrow as pa
 
-_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+DTypeMap = dict[str | int, DType]
+ColumnNameFrom = Literal["provided", "looked_up", "generated"]
+DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"]
 
-_DTypeMap = dict[str, _DType] | dict[int, _DType]
+class ColumnInfo:
+    def __init__(
+        self,
+        *,
+        name: str,
+        index: int,
+        column_name_from: ColumnNameFrom,
+        dtype: DType,
+        dtype_from: DTypeFrom,
+    ) -> None: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def index(self) -> int: ...
+    @property
+    def dtype(self) -> DType: ...
+    @property
+    def column_name_from(self) -> ColumnNameFrom: ...
+    @property
+    def dtype_from(self) -> DTypeFrom: ...
 
 class _ExcelSheet:
     @property
@@ -25,13 +47,13 @@ class _ExcelSheet:
     def offset(self) -> int:
         """The sheet's offset before data starts"""
     @property
-    def selected_columns(self) -> list[str] | list[int] | None:
+    def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
     @property
-    def available_columns(self) -> list[str]:
+    def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
     @property
-    def specified_dtypes(self) -> _DTypeMap | None:
+    def specified_dtypes(self) -> DTypeMap | None:
         """The dtypes specified for the sheet"""
     def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
@@ -49,7 +71,7 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
-        dtypes: _DTypeMap | None = None,
+        dtypes: DTypeMap | None = None,
     ) -> _ExcelSheet: ...
     @property
     def sheet_names(self) -> list[str]: ...

diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py
@@ -9,14 +9,16 @@
 from utils import path_for_fixture
 
 
-@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]])
+@pytest.mark.parametrize(
+    "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]]
+)
 def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
     excel_reader = fastexcel.read_excel(
         path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
     )
 
     sheet = excel_reader.load_sheet(0, use_columns=use_columns)
-    assert sheet.available_columns == ["col", "col_1", "col_2"]
+    assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"]
 
     pd_assert_frame_equal(
         sheet.to_pandas(),