Skip to content

Commit

Permalink
feat: support mixed indices and column names for dtypes and column selection (#206)
Browse files Browse the repository at this point in the history

* refactor: moved excelsheet to a subdirectory

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat: support mixed indices and column names for dtypes and column selection

Work items:

* Introduced a new type providing information about a column: `ColumnInfo`
* `ExcelSheet.selected_columns` now returns a list of ColumnInfo for all
  selected columns
* `ExcelSheet.available_columns` now returns a list of ColumnInfo for all
  available columns
* Much of the to/from-Python conversion code is now implemented through
  traits, which reduces the amount of manual validation

closes #198

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
  • Loading branch information
lukapeschke authored Mar 25, 2024
1 parent a4a7f3e commit f3c92d6
Show file tree
Hide file tree
Showing 13 changed files with 848 additions and 614 deletions.
30 changes: 22 additions & 8 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
CalamineCellError,
CalamineError,
CannotRetrieveCellDataError,
ColumnInfo,
ColumnNotFoundError,
FastExcelError,
InvalidParametersError,
Expand All @@ -30,7 +31,9 @@
from ._fastexcel import read_excel as _read_excel

DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]"
DTypeMap: TypeAlias = "dict[str | int, DType]"
ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"]


class ExcelSheet:
Expand Down Expand Up @@ -60,12 +63,12 @@ def total_height(self) -> int:
return self._sheet.total_height

@property
def selected_columns(self) -> list[str] | list[int] | None:
def selected_columns(self) -> list[ColumnInfo]:
"""The sheet's selected columns"""
return self._sheet.selected_columns

@property
def available_columns(self) -> list[str]:
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given sheet"""
return self._sheet.available_columns

Expand Down Expand Up @@ -141,13 +144,12 @@ def load_sheet(
If `None`, all rows will be used.
:param use_columns: Specifies the columns to use. Can either be:
- `None` to select all columns
- a list of strings, the column names
- a list of ints, the column indices (starting at 0)
- a string, a comma separated list of Excel column letters and column
- A list of strings and ints, the column names and/or indices
(starting at 0)
- A string, a comma separated list of Excel column letters and column
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
`A,B,C,D,E` and `A,C,E,F`)
:param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
is a list of ints or an Excel range), or column names
:param dtypes: An optional dict of dtypes. Keys can be column indices or names
"""
return ExcelSheet(
self._reader.load_sheet(
Expand Down Expand Up @@ -235,10 +237,22 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:


__all__ = (
## version
"__version__",
## main entrypoint
"read_excel",
## Python types
"DType",
"DTypeMap",
# Excel reader
"ExcelReader",
# Excel sheet
"ExcelSheet",
# Column metadata
"DTypeFrom",
"ColumnNameFrom",
"ColumnInfo",
# Exceptions
"FastExcelError",
"CannotRetrieveCellDataError",
"CalamineCellError",
Expand Down
34 changes: 28 additions & 6 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,31 @@ from typing import Literal

import pyarrow as pa

_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
DTypeMap = dict[str | int, DType]
ColumnNameFrom = Literal["provided", "looked_up", "generated"]
DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"]

_DTypeMap = dict[str, _DType] | dict[int, _DType]
# Type stub for `ColumnInfo`, the type that provides information about a single
# column. `selected_columns` and `available_columns` return lists of it.
# Only signatures are declared here (every body is `...`).
class ColumnInfo:
# Constructor: the bare `*` makes every argument keyword-only.
def __init__(
self,
*,
# The column's name.
name: str,
# The column's index. NOTE(review): presumably 0-based, like the indices
# accepted by `use_columns` - confirm against the implementation.
index: int,
# Origin of the name: one of "provided", "looked_up" or "generated".
column_name_from: ColumnNameFrom,
# The column's dtype, one of the `DType` literals.
dtype: DType,
# Origin of the dtype: one of "provided_by_index", "provided_by_name" or
# "guessed".
dtype_from: DTypeFrom,
) -> None: ...
# Read-only accessors (declared as properties, no setters) mirroring the
# constructor arguments.
@property
def name(self) -> str: ...
@property
def index(self) -> int: ...
@property
def dtype(self) -> DType: ...
# NOTE(review): presumably tells whether the name was passed by the caller,
# read from the sheet, or auto-generated - confirm exact semantics.
@property
def column_name_from(self) -> ColumnNameFrom: ...
# NOTE(review): presumably tells whether the dtype came from a `dtypes` entry
# keyed by index, one keyed by name, or was inferred - confirm exact semantics.
@property
def dtype_from(self) -> DTypeFrom: ...

class _ExcelSheet:
@property
Expand All @@ -25,13 +47,13 @@ class _ExcelSheet:
def offset(self) -> int:
"""The sheet's offset before data starts"""
@property
def selected_columns(self) -> list[str] | list[int] | None:
def selected_columns(self) -> list[ColumnInfo]:
"""The sheet's selected columns"""
@property
def available_columns(self) -> list[str]:
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given sheet"""
@property
def specified_dtypes(self) -> _DTypeMap | None:
def specified_dtypes(self) -> DTypeMap | None:
"""The dtypes specified for the sheet"""
def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`"""
Expand All @@ -49,7 +71,7 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: _DTypeMap | None = None,
dtypes: DTypeMap | None = None,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand Down
6 changes: 4 additions & 2 deletions python/tests/test_alias_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
from utils import path_for_fixture


@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]])
@pytest.mark.parametrize(
"use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]]
)
def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
excel_reader = fastexcel.read_excel(
path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
)

sheet = excel_reader.load_sheet(0, use_columns=use_columns)
assert sheet.available_columns == ["col", "col_1", "col_2"]
assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"]

pd_assert_frame_equal(
sheet.to_pandas(),
Expand Down
Loading

0 comments on commit f3c92d6

Please sign in to comment.