From f3c92d65189d66d317cf0dc834b4efef23009a4a Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 25 Mar 2024 09:37:02 +0100 Subject: [PATCH] feat: support mixed indices and column names for dtypes and column selection (#206) * refactor: moved excelsheet to a subdirectory Signed-off-by: Luka Peschke * feat: support mixed indices and column names for dtypes and column selection Work items: * Introduced a new type providing information about a column: `ColumnInfo` * `ExcelSheet.selected_columns` now returns a list of ColumnInfo for all selected columns * `ExcelSheet.available_columns` now returns a list of ColumnInfo for all available columns * A lot of to/from python code has been wrapped in traits to have less manual validation closes #198 --------- Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 30 +- python/fastexcel/_fastexcel.pyi | 34 +- python/tests/test_alias_generation.py | 6 +- python/tests/test_column_selection.py | 123 ++++-- python/tests/test_dtypes.py | 2 +- src/lib.rs | 4 +- src/types/dtype.rs | 269 ++++++++---- src/types/idx_or_name.rs | 45 +- src/types/python/excelreader.rs | 15 +- src/types/python/excelsheet/column_info.rs | 273 ++++++++++++ .../{excelsheet.rs => excelsheet/mod.rs} | 390 +++++++++--------- src/utils/arrow.rs | 270 ------------ src/utils/mod.rs | 1 - 13 files changed, 848 insertions(+), 614 deletions(-) create mode 100644 src/types/python/excelsheet/column_info.rs rename src/types/python/{excelsheet.rs => excelsheet/mod.rs} (70%) delete mode 100644 src/utils/arrow.rs delete mode 100644 src/utils/mod.rs diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 749939a..f8e2c33 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -18,6 +18,7 @@ CalamineCellError, CalamineError, CannotRetrieveCellDataError, + ColumnInfo, ColumnNotFoundError, FastExcelError, InvalidParametersError, @@ -30,7 +31,9 @@ from ._fastexcel import read_excel as _read_excel DType = Literal["null", 
"int", "float", "string", "boolean", "datetime", "date", "duration"] -DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]" +DTypeMap: TypeAlias = "dict[str | int, DType]" +ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"] +DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"] class ExcelSheet: @@ -60,12 +63,12 @@ def total_height(self) -> int: return self._sheet.total_height @property - def selected_columns(self) -> list[str] | list[int] | None: + def selected_columns(self) -> list[ColumnInfo]: """The sheet's selected columns""" return self._sheet.selected_columns @property - def available_columns(self) -> list[str]: + def available_columns(self) -> list[ColumnInfo]: """The columns available for the given sheet""" return self._sheet.available_columns @@ -141,13 +144,12 @@ def load_sheet( If `None`, all rows will be used. :param use_columns: Specifies the columns to use. Can either be: - `None` to select all columns - - a list of strings, the column names - - a list of ints, the column indices (starting at 0) - - a string, a comma separated list of Excel column letters and column + - A list of strings and ints, the column names and/or indices + (starting at 0) + - A string, a comma separated list of Excel column letters and column ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in `A,B,C,D,E` and `A,C,E,F`) - :param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns` - is a list of ints or an Excel range), or column names + :param dtypes: An optional dict of dtypes. 
Keys can be column indices or names """ return ExcelSheet( self._reader.load_sheet( @@ -235,10 +237,22 @@ def read_excel(source: Path | str | bytes) -> ExcelReader: __all__ = ( + ## version "__version__", + ## main entrypoint "read_excel", + ## Python types + "DType", + "DTypeMap", + # Excel reader "ExcelReader", + # Excel sheet "ExcelSheet", + # Column metadata + "DTypeFrom", + "ColumnNameFrom", + "ColumnInfo", + # Exceptions "FastExcelError", "CannotRetrieveCellDataError", "CalamineCellError", diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 421c4c3..0d6d3e1 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -4,9 +4,31 @@ from typing import Literal import pyarrow as pa -_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] +DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] +DTypeMap = dict[str | int, DType] +ColumnNameFrom = Literal["provided", "looked_up", "generated"] +DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"] -_DTypeMap = dict[str, _DType] | dict[int, _DType] +class ColumnInfo: + def __init__( + self, + *, + name: str, + index: int, + column_name_from: ColumnNameFrom, + dtype: DType, + dtype_from: DTypeFrom, + ) -> None: ... + @property + def name(self) -> str: ... + @property + def index(self) -> int: ... + @property + def dtype(self) -> DType: ... + @property + def column_name_from(self) -> ColumnNameFrom: ... + @property + def dtype_from(self) -> DTypeFrom: ... 
class _ExcelSheet: @property @@ -25,13 +47,13 @@ class _ExcelSheet: def offset(self) -> int: """The sheet's offset before data starts""" @property - def selected_columns(self) -> list[str] | list[int] | None: + def selected_columns(self) -> list[ColumnInfo]: """The sheet's selected columns""" @property - def available_columns(self) -> list[str]: + def available_columns(self) -> list[ColumnInfo]: """The columns available for the given sheet""" @property - def specified_dtypes(self) -> _DTypeMap | None: + def specified_dtypes(self) -> DTypeMap | None: """The dtypes specified for the sheet""" def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" @@ -49,7 +71,7 @@ class _ExcelReader: n_rows: int | None = None, schema_sample_rows: int | None = 1_000, use_columns: list[str] | list[int] | str | None = None, - dtypes: _DTypeMap | None = None, + dtypes: DTypeMap | None = None, ) -> _ExcelSheet: ... @property def sheet_names(self) -> list[str]: ... diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py index 562dd3b..728c73a 100644 --- a/python/tests/test_alias_generation.py +++ b/python/tests/test_alias_generation.py @@ -9,14 +9,16 @@ from utils import path_for_fixture -@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]]) +@pytest.mark.parametrize( + "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]] +) def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None: excel_reader = fastexcel.read_excel( path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx") ) sheet = excel_reader.load_sheet(0, use_columns=use_columns) - assert sheet.available_columns == ["col", "col_1", "col_2"] + assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"] pd_assert_frame_equal( sheet.to_pandas(), diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 
a4cade3..ce1dca5 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -17,12 +17,27 @@ def excel_reader_single_sheet() -> fastexcel.ExcelReader: return fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) -def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: +@pytest.fixture +def expected_column_info() -> list[fastexcel.ColumnInfo]: + return [ + fastexcel.ColumnInfo( + name="Month", index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed" + ), + fastexcel.ColumnInfo( + name="Year", index=1, column_name_from="looked_up", dtype="float", dtype_from="guessed" + ), + ] + + +def test_single_sheet_all_columns( + excel_reader_single_sheet: fastexcel.ExcelReader, + expected_column_info: list[fastexcel.ColumnInfo], +) -> None: sheet = excel_reader_single_sheet.load_sheet(0) sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) - assert sheet.selected_columns is None - assert sheet.available_columns == ["Month", "Year"] + assert sheet.selected_columns == expected_column_info + assert sheet.available_columns == expected_column_info expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} expected_pd_df = pd.DataFrame(expected) @@ -39,16 +54,19 @@ def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelRead pl_assert_frame_equal(pl_df_explicit_arg, expected_pl_df) -def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: +def test_single_sheet_subset_by_str( + excel_reader_single_sheet: fastexcel.ExcelReader, + expected_column_info: list[fastexcel.ColumnInfo], +) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} # looks like mypy 1.8 became more stupid sheets: list[str | int] = [0, "January"] for sheet_name_or_idx in sheets: - for col in ["Month", "Year"]: + for idx, col in enumerate(["Month", "Year"]): sheet = 
excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) - assert sheet.selected_columns == [col] - assert sheet.available_columns == ["Month", "Year"] + assert sheet.selected_columns == [expected_column_info[idx]] + assert sheet.available_columns == expected_column_info pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) @@ -57,15 +75,18 @@ def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelRe pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]})) -def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: +def test_single_sheet_subset_by_index( + excel_reader_single_sheet: fastexcel.ExcelReader, + expected_column_info: list[fastexcel.ColumnInfo], +) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} sheets: list[str | int] = [0, "January"] for sheet_name_or_idx in sheets: for idx, col_name in enumerate(["Month", "Year"]): sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) - assert sheet.selected_columns == [idx] - assert sheet.available_columns == ["Month", "Year"] + assert sheet.selected_columns == [expected_column_info[idx]] + assert sheet.available_columns == expected_column_info pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) @@ -90,9 +111,39 @@ def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]: } +@pytest.fixture +def sheet_with_unnamed_columns_expected_column_info() -> list[fastexcel.ColumnInfo]: + return [ + fastexcel.ColumnInfo( + name="col1", index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed" + ), + fastexcel.ColumnInfo( + name="__UNNAMED__1", + index=1, + column_name_from="generated", + dtype="float", + dtype_from="guessed", + ), + fastexcel.ColumnInfo( + name="col3", index=2, column_name_from="looked_up", dtype="string", dtype_from="guessed" + ), + fastexcel.ColumnInfo( + 
name="__UNNAMED__3", + index=3, + column_name_from="generated", + dtype="float", + dtype_from="guessed", + ), + fastexcel.ColumnInfo( + name="col5", index=4, column_name_from="looked_up", dtype="string", dtype_from="guessed" + ), + ] + + def test_single_sheet_with_unnamed_columns( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], + sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = ["col1", "col3", "__UNNAMED__3"] use_columns_idx = [0, 2, 3] @@ -103,8 +154,12 @@ def test_single_sheet_with_unnamed_columns( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) - assert sheet.selected_columns == use_columns_str - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.selected_columns == [ + sheet_with_unnamed_columns_expected_column_info[0], + sheet_with_unnamed_columns_expected_column_info[2], + sheet_with_unnamed_columns_expected_column_info[3], + ] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -112,8 +167,12 @@ def test_single_sheet_with_unnamed_columns( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx ) - assert sheet.selected_columns == use_columns_idx - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.selected_columns == [ + sheet_with_unnamed_columns_expected_column_info[0], + sheet_with_unnamed_columns_expected_column_info[2], + sheet_with_unnamed_columns_expected_column_info[3], + ] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), 
pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -122,6 +181,7 @@ def test_single_sheet_with_unnamed_columns( def test_single_sheet_with_unnamed_columns_and_pagination( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], + sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = ["col1", "col3", "__UNNAMED__3"] use_columns_idx = [0, 2, 3] @@ -136,7 +196,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, n_rows=1 ) - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -144,7 +204,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, n_rows=1 ) - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -159,7 +219,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1 ) - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), 
pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -167,7 +227,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1 ) - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -189,7 +249,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names ) - assert sheet.available_columns == column_names + assert [col.name for col in sheet.available_columns] == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -197,7 +257,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names ) - assert sheet.available_columns == column_names + assert [col.name for col in sheet.available_columns] == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -208,7 +268,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names ) - assert sheet.available_columns == column_names + assert [col.name for col in sheet.available_columns] == column_names 
pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) @@ -216,7 +276,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names ) - assert sheet.available_columns == column_names + assert [col.name for col in sheet.available_columns] == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) @@ -225,9 +285,9 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( def test_single_sheet_with_unnamed_columns_and_str_range( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], + sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = "A,C:E" - use_columns_idx = [0, 2, 3, 4] expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() @@ -236,8 +296,11 @@ def test_single_sheet_with_unnamed_columns_and_str_range( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) - assert sheet.selected_columns == use_columns_idx - assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + assert sheet.selected_columns == ( + sheet_with_unnamed_columns_expected_column_info[:1] + + sheet_with_unnamed_columns_expected_column_info[2:] + ) + assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -269,11 +332,11 @@ def 
test_single_sheet_invalid_column_indices_empty_list( def test_single_sheet_invalid_column_indices_column_does_not_exist_str( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: - expected_message = """column with name "nope" not found + expected_message = """column with name \"nope\" not found Context: - 0: selected columns are invalid, available columns are: ["Month", "Year"] + 0: available columns are: .* """ - with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"]) @@ -282,7 +345,7 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int( ) -> None: expected_message = """column at index 42 not found Context: - 0: selected columns are invalid, available columns are: ["Month", "Year"] + 0: available columns are: .* """ - with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py index 82392ef..acc5c4e 100644 --- a/python/tests/test_dtypes.py +++ b/python/tests/test_dtypes.py @@ -130,7 +130,7 @@ def test_sheet_with_mixed_dtypes_specify_dtypes( expected_pd_dtype: str, expected_pl_dtype: pl.DataType, ) -> None: - dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype} # type:ignore[dict-item] + dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype} excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx")) sheet = excel_reader.load_sheet(0, dtypes=dtypes, n_rows=5) assert sheet.specified_dtypes == dtypes diff --git a/src/lib.rs b/src/lib.rs index befda90..3ae7070 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -1,10 +1,9 @@ mod error; mod types; -mod utils; use error::{py_errors, ErrorContext}; use pyo3::prelude::*; -use types::python::{ExcelReader, ExcelSheet}; +use types::python::{excelsheet::column_info::ColumnInfo, ExcelReader, ExcelSheet}; /// Reads an excel file and returns an object allowing to access its sheets and a bit of metadata #[pyfunction] @@ -41,6 +40,7 @@ fn get_version() -> String { #[pymodule] fn _fastexcel(py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(read_excel, m)?)?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add("__version__", get_version())?; diff --git a/src/types/dtype.rs b/src/types/dtype.rs index 1484bfd..90e7576 100644 --- a/src/types/dtype.rs +++ b/src/types/dtype.rs @@ -1,14 +1,18 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + sync::OnceLock, +}; use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; -use pyo3::{ - types::{IntoPyDict, PyDict}, - PyObject, Python, ToPyObject, -}; +use calamine::{CellErrorType, Data as CalData, DataType, Range}; +use pyo3::{FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject}; + +use crate::error::{py_errors::IntoPyResult, FastExcelError, FastExcelErrorKind, FastExcelResult}; -use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult}; +use super::idx_or_name::IdxOrName; -#[derive(Debug)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)] pub(crate) enum DType { Null, Int, @@ -41,8 +45,8 @@ impl FromStr for DType { } } -impl ToPyObject for DType { - fn to_object(&self, py: Python<'_>) -> PyObject { +impl ToString for DType { + fn to_string(&self) -> String { match self { DType::Null => "null", DType::Int => "int", @@ -53,83 +57,32 @@ impl ToPyObject for DType { DType::Date => "date", DType::Duration => "duration", } - .to_object(py) - } -} - -#[derive(Debug)] -pub(crate) enum DTypeMap { - ByIndex(HashMap), - ByName(HashMap), -} - -impl DTypeMap { - 
pub(crate) fn dtype_for_col_name(&self, col_name: &String) -> Option<&DType> { - match self { - DTypeMap::ByName(name_map) => name_map.get(col_name), - _ => None, - } - } - - pub(crate) fn dtype_for_col_idx(&self, col_idx: usize) -> Option<&DType> { - match self { - DTypeMap::ByIndex(idx_map) => idx_map.get(&col_idx), - _ => None, - } - } -} - -impl> TryFrom> for DTypeMap { - type Error = FastExcelError; - - fn try_from(value: HashMap) -> FastExcelResult { - value - .into_iter() - .map(|(column, raw_dtype)| { - raw_dtype - .as_ref() - .parse() - .map(|raw_dtype| (column, raw_dtype)) - }) - .collect::>>() - .map(Self::ByIndex) + .to_string() } } -impl> TryFrom> for DTypeMap { - type Error = FastExcelError; - - fn try_from(value: HashMap) -> FastExcelResult { - value - .into_iter() - .map(|(column, raw_dtype)| { - raw_dtype - .as_ref() - .parse() - .map(|raw_dtype| (column, raw_dtype)) - }) - .collect::>>() - .map(Self::ByName) +impl ToPyObject for DType { + fn to_object(&self, py: Python<'_>) -> PyObject { + self.to_string().to_object(py) } } -impl TryFrom<&PyDict> for DTypeMap { - type Error = FastExcelError; - - fn try_from(py_dict: &PyDict) -> FastExcelResult { - if let Ok(string_map) = py_dict.extract::>() { - string_map.try_into() - } else if let Ok(string_map) = py_dict.extract::>() { - string_map.try_into() +impl FromPyObject<'_> for DType { + fn extract(py_dtype: &PyAny) -> PyResult { + if let Ok(dtype_str) = py_dtype.extract::<&str>() { + dtype_str.parse() } else { Err(FastExcelErrorKind::InvalidParameters(format!( - "unsupported dtype map: {py_dict:?}" + "{py_dtype:?} cannot be converted to str" )) .into()) } + .into_pyresult() } } +pub(crate) type DTypeMap = HashMap; + impl From<&DType> for ArrowDataType { fn from(dtype: &DType) -> Self { match dtype { @@ -145,20 +98,166 @@ impl From<&DType> for ArrowDataType { } } -impl ToPyObject for DTypeMap { - fn to_object(&self, py: Python<'_>) -> PyObject { - match self { - DTypeMap::ByIndex(idx_map) => idx_map - 
.iter() - .map(|(k, v)| (k, v.to_object(py))) - .into_py_dict(py) - .into(), +/// All the possible string values that should be considered as NULL +const NULL_STRING_VALUES: [&str; 19] = [ + "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", + "", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null", +]; + +fn get_cell_dtype(data: &Range, row: usize, col: usize) -> FastExcelResult { + let cell = data + .get((row, col)) + .ok_or_else(|| FastExcelErrorKind::CannotRetrieveCellData(row, col))?; + + match cell { + CalData::Int(_) => Ok(DType::Int), + CalData::Float(_) => Ok(DType::Float), + CalData::String(v) => match v { + v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(DType::Null), + _ => Ok(DType::String), + }, + CalData::Bool(_) => Ok(DType::Bool), + // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be + // a duration or a datetime + CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() { + DType::DateTime + } else { + DType::Duration + }), + // These types contain an ISO8601 representation of a date/datetime or a duration + CalData::DateTimeIso(_) => match cell.as_datetime() { + Some(_) => Ok(DType::DateTime), + // If we cannot convert the cell to a datetime, we're working on a date + // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime + // rather than a date + None => Ok(DType::Date), + }, + // A simple duration + CalData::DurationIso(_) => Ok(DType::Duration), + // Errors and nulls + CalData::Error(err) => match err { + CellErrorType::NA => Ok(DType::Null), + _ => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()), + }, + CalData::Empty => Ok(DType::Null), + } +} + +static FLOAT_TYPES_CELL: OnceLock> = OnceLock::new(); +static INT_TYPES_CELL: OnceLock> = OnceLock::new(); +static STRING_TYPES_CELL: OnceLock> = OnceLock::new(); + +fn float_types() -> &'static HashSet { + FLOAT_TYPES_CELL.get_or_init(|| 
HashSet::from([DType::Int, DType::Float, DType::Bool])) +} + +fn int_types() -> &'static HashSet { + INT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Bool])) +} - DTypeMap::ByName(name_map) => name_map - .iter() - .map(|(k, v)| (k, v.to_object(py))) - .into_py_dict(py) +fn string_types() -> &'static HashSet { + STRING_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::String])) +} + +pub(crate) fn get_dtype_for_column( + data: &Range, + start_row: usize, + end_row: usize, + col: usize, +) -> FastExcelResult { + let mut column_types = (start_row..end_row) + .map(|row| get_cell_dtype(data, row, col)) + .collect::>>()?; + + // All columns are nullable anyway so we're not taking Null into account here + column_types.remove(&DType::Null); + + if column_types.is_empty() { + // If no type apart from NULL was found, it's a NULL column + Ok(DType::Null) + } else if column_types.len() == 1 { + // If a single non-null type was found, return it + Ok(column_types.into_iter().next().unwrap()) + } else if column_types.is_subset(int_types()) { + // If every cell in the column can be converted to an int, return int64 + Ok(DType::Int) + } else if column_types.is_subset(float_types()) { + // If every cell in the column can be converted to a float, return Float64 + Ok(DType::Float) + } else if column_types.is_subset(string_types()) { + // If every cell in the column can be converted to a string, return Utf8 + Ok(DType::String) + } else { + // NOTE: Not being too smart about multi-types columns for now + Err( + FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}")) .into(), - } + ) + } +} + +#[cfg(test)] +mod tests { + use calamine::Cell; + use rstest::{fixture, rstest}; + + use super::*; + + #[fixture] + fn range() -> Range { + Range::from_sparse(vec![ + // First column + Cell::new((0, 0), CalData::Bool(true)), + Cell::new((1, 0), CalData::Bool(false)), + Cell::new((2, 0), CalData::String("NULL".to_string())), + 
Cell::new((3, 0), CalData::Int(42)), + Cell::new((4, 0), CalData::Float(13.37)), + Cell::new((5, 0), CalData::String("hello".to_string())), + Cell::new((6, 0), CalData::Empty), + Cell::new((7, 0), CalData::String("#N/A".to_string())), + Cell::new((8, 0), CalData::Int(12)), + Cell::new((9, 0), CalData::Float(12.21)), + Cell::new((10, 0), CalData::Bool(true)), + Cell::new((11, 0), CalData::Int(1337)), + ]) + } + + #[rstest] + // pure bool + #[case(0, 2, DType::Bool)] + // pure int + #[case(3, 4, DType::Int)] + // pure float + #[case(4, 5, DType::Float)] + // pure string + #[case(5, 6, DType::String)] + // pure int + float + #[case(3, 5, DType::Float)] + // null + int + float + #[case(2, 5, DType::Float)] + // float + string + #[case(4, 6, DType::String)] + // int + float + string + #[case(3, 6, DType::String)] + // null + int + float + string + empty + null + #[case(2, 8, DType::String)] + // empty + null + int + #[case(6, 9, DType::Int)] + // int + float + null + #[case(7, 10, DType::Float)] + // int + float + bool + null + #[case(7, 11, DType::Float)] + // int + bool + #[case(10, 12, DType::Int)] + fn get_arrow_column_type_multi_dtype_ok( + range: Range, + #[case] start_row: usize, + #[case] end_row: usize, + #[case] expected: DType, + ) { + assert_eq!( + get_dtype_for_column(&range, start_row, end_row, 0).unwrap(), + expected + ); } } diff --git a/src/types/idx_or_name.rs b/src/types/idx_or_name.rs index 503c53d..6788530 100644 --- a/src/types/idx_or_name.rs +++ b/src/types/idx_or_name.rs @@ -1,8 +1,8 @@ -use pyo3::PyAny; +use pyo3::{FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject}; -use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult}; +use crate::error::{py_errors::IntoPyResult, FastExcelError, FastExcelErrorKind, FastExcelResult}; -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq, Hash, Clone)] pub(crate) enum IdxOrName { Idx(usize), Name(String), @@ -20,16 +20,43 @@ impl IdxOrName { impl TryFrom<&PyAny> for IdxOrName { type Error 
= FastExcelError; - fn try_from(py_any: &PyAny) -> FastExcelResult { - if let Ok(name) = py_any.extract::() { - Ok(IdxOrName::Name(name)) - } else if let Ok(index) = py_any.extract::() { - Ok(IdxOrName::Idx(index)) + fn try_from(value: &PyAny) -> FastExcelResult { + if let Ok(index) = value.extract() { + Ok(Self::Idx(index)) + } else if let Ok(name) = value.extract() { + Ok(Self::Name(name)) } else { Err(FastExcelErrorKind::InvalidParameters(format!( - "cannot create IdxOrName from {py_any:?}" + "cannot create IdxOrName from {value:?}" )) .into()) } } } + +impl FromPyObject<'_> for IdxOrName { + fn extract(value: &PyAny) -> PyResult { + value.try_into().into_pyresult() + } +} + +impl ToPyObject for IdxOrName { + fn to_object(&self, py: Python<'_>) -> PyObject { + match self { + IdxOrName::Idx(idx) => idx.to_object(py), + IdxOrName::Name(name) => name.to_object(py), + } + } +} + +impl From for IdxOrName { + fn from(index: usize) -> Self { + Self::Idx(index) + } +} + +impl From for IdxOrName { + fn from(name: String) -> Self { + Self::Name(name) + } +} diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index d53589c..8de50de 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -4,7 +4,7 @@ use std::{ }; use calamine::{open_workbook_auto, open_workbook_auto_from_rs, Data, Range, Reader, Sheets}; -use pyo3::{pyclass, pymethods, types::PyDict, PyAny, PyResult}; +use pyo3::{pyclass, pymethods, PyAny, PyResult}; use crate::{ error::{ @@ -62,14 +62,6 @@ impl ExcelReader { }) } - fn build_dtypes(raw_dtypes: Option<&PyDict>) -> FastExcelResult> { - match raw_dtypes { - None => Ok(None), - Some(py_dict) => py_dict.try_into().map(Some), - } - .with_context(|| "could not parse provided dtypes") - } - fn build_selected_columns(use_columns: Option<&PyAny>) -> FastExcelResult { use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")) } @@ 
-84,14 +76,13 @@ impl ExcelReader { n_rows: Option, schema_sample_rows: Option, use_columns: Option<&PyAny>, - dtypes: Option<&PyDict>, + dtypes: Option, ) -> FastExcelResult { let range = self.sheets.worksheet_range(&name)?; let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range)?; let selected_columns = Self::build_selected_columns(use_columns)?; - let dtypes = Self::build_dtypes(dtypes)?; ExcelSheet::try_new( name, range, @@ -148,7 +139,7 @@ impl ExcelReader { n_rows: Option, schema_sample_rows: Option, use_columns: Option<&PyAny>, - dtypes: Option<&PyDict>, + dtypes: Option, ) -> PyResult { let name = idx_or_name .try_into() diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs new file mode 100644 index 0000000..d26882e --- /dev/null +++ b/src/types/python/excelsheet/column_info.rs @@ -0,0 +1,273 @@ +use std::{str::FromStr, usize}; + +use calamine::{Data as CalData, Range}; +use pyo3::{pyclass, pymethods, PyResult}; + +use crate::{ + error::{ + py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + }, + types::{ + dtype::{get_dtype_for_column, DType, DTypeMap}, + idx_or_name::IdxOrName, + }, +}; + +#[derive(Debug, Clone, PartialEq)] +pub(crate) enum ColumnNameFrom { + Provided, + LookedUp, + Generated, +} + +impl FromStr for ColumnNameFrom { + type Err = FastExcelError; + + fn from_str(s: &str) -> FastExcelResult { + match s { + "provided" => Ok(Self::Provided), + "looked_up" => Ok(Self::LookedUp), + "generated" => Ok(Self::Generated), + _ => Err( + FastExcelErrorKind::InvalidParameters(format!("invalid ColumnNameFrom: {s}")) + .into(), + ), + } + } +} + +impl ToString for ColumnNameFrom { + fn to_string(&self) -> String { + match self { + ColumnNameFrom::Provided => "provided", + ColumnNameFrom::LookedUp => "looked_up", + ColumnNameFrom::Generated => "generated", + } + .to_string() + } +} + +#[derive(Debug, Clone, 
PartialEq)] +pub(crate) enum DTypeFrom { + ProvidedByIndex, + ProvidedByName, + Guessed, +} + +impl ToString for DTypeFrom { + fn to_string(&self) -> String { + match self { + DTypeFrom::ProvidedByIndex => "provided_by_index", + DTypeFrom::ProvidedByName => "provided_by_name", + DTypeFrom::Guessed => "guessed", + } + .to_string() + } +} + +impl FromStr for DTypeFrom { + type Err = FastExcelError; + + fn from_str(s: &str) -> FastExcelResult { + match s { + "provided_by_index" => Ok(Self::ProvidedByIndex), + "provided_by_name" => Ok(Self::ProvidedByName), + "guessed" => Ok(Self::Guessed), + _ => Err( + FastExcelErrorKind::InvalidParameters(format!("invalid DTypesFrom: {s}")).into(), + ), + } + } +} + +// NOTE: The types for properties unfortunately do not appear in the docs for this class, so we had +// to specify them via docstrings +/// This class provides information about a single column in a sheet +#[derive(Debug, Clone, PartialEq)] +#[pyclass(name = "ColumnInfo")] +pub(crate) struct ColumnInfo { + /// `str`. The name of the column + #[pyo3(get)] + name: String, + /// `int`. The index of the column + #[pyo3(get)] + index: usize, + dtype: DType, + column_name_from: ColumnNameFrom, + dtype_from: DTypeFrom, +} + +impl ColumnInfo { + pub(crate) fn new( + name: String, + index: usize, + column_name_from: ColumnNameFrom, + dtype: DType, + dtype_from: DTypeFrom, + ) -> Self { + Self { + name, + index, + dtype, + column_name_from, + dtype_from, + } + } + + pub(crate) fn name(&self) -> &str { + &self.name + } + + pub(crate) fn index(&self) -> usize { + self.index + } + + pub(crate) fn dtype(&self) -> &DType { + &self.dtype + } +} + +#[pymethods] +impl ColumnInfo { + /// Creates a new ColumnInfo object. + /// + /// - `name`: `str`. The name of the column + /// - `index`: `int`. The index of the column. Must be >=0 + /// - `column_name_from`: `fastexcel.ColumnNameFrom`. The origin of the column name + /// - `dtype`: `fastexcel.DType`. 
The dtype of the column + /// - `dtype_from`: `fastexcel.DTypeFrom`. The origin of the dtype for the column + #[new] + pub(crate) fn py_new( + name: String, + index: usize, + column_name_from: &str, + dtype: &str, + dtype_from: &str, + ) -> PyResult { + Ok(Self::new( + name, + index, + column_name_from.parse().into_pyresult()?, + dtype.parse().into_pyresult()?, + dtype_from.parse().into_pyresult()?, + )) + } + /// `fastexcel.DType`. The dtype of the column + #[getter(dtype)] + fn get_dtype(&self) -> String { + self.dtype.to_string() + } + + /// `fastexcel.ColumnNameFrom`. How the name of the column was determined. + /// + /// One of three possible values: + /// - `"provided"`: The column name was provided via the `use_columns` parameter + /// - `"looked_up"`: The column name was looked up from the data found in the sheet + /// - `"generated"`: The column name was generated from the column index, either because + /// `header_row` was `None`, or because it could not be looked up + #[getter(column_name_from)] + fn get_colum_name_from(&self) -> String { + self.column_name_from.to_string() + } + + /// `fastexcel.DTypeFrom`. How the dtype of the column was determined. 
+ /// + /// One of three possible values: + /// - `"provided_by_index"`: The dtype was specified via the column index + /// - `"provided_by_name"`: The dtype was specified via the column name + /// - `"guessed"`: The dtype was determined from the content of the column + #[getter(dtype_from)] + fn get_dtype_from(&self) -> String { + self.dtype_from.to_string() + } + + pub fn __repr__(&self) -> String { + format!("ColumnInfo", name=self.name, index=self.index, dtype=self.dtype.to_string(), dtype_from=self.dtype_from.to_string(), column_name_from=self.column_name_from.to_string()) + } + + pub fn __eq__(&self, other: &Self) -> bool { + self == other + } +} + +#[derive(Debug)] +pub(super) struct ColumnInfoBuilder { + name: String, + index: usize, + column_name_from: ColumnNameFrom, +} + +// Allows us to easily compare ourselves to a column index or name +impl PartialEq for ColumnInfoBuilder { + fn eq(&self, other: &IdxOrName) -> bool { + match other { + IdxOrName::Idx(index) => index == &self.index, + IdxOrName::Name(name) => name == &self.name, + } + } +} + +impl ColumnInfoBuilder { + pub(super) fn new(name: String, index: usize, column_name_from: ColumnNameFrom) -> Self { + Self { + name, + index, + column_name_from, + } + } + + pub(super) fn with_name(mut self, name: String) -> Self { + self.name = name; + self + } + + pub(super) fn name(&self) -> &str { + &self.name + } + + fn dtype_info( + &self, + data: &Range, + start_row: usize, + end_row: usize, + specified_dtypes: Option<&DTypeMap>, + ) -> FastExcelResult<(DType, DTypeFrom)> { + specified_dtypes + .and_then(|dtypes| { + // if we have dtypes, look the dtype up by index, and fall back on a lookup by name + // (done in this order because copying an usize is cheaper than cloning a string) + if let Some(dtype) = dtypes.get(&self.index.into()) { + Some((*dtype, DTypeFrom::ProvidedByIndex)) + } else { + dtypes + .get(&self.name.clone().into()) + .map(|dtype| (*dtype, DTypeFrom::ProvidedByName)) + } + }) + 
.map(FastExcelResult::Ok) + // If we could not look up a dtype, guess it from the data + .unwrap_or_else(|| { + get_dtype_for_column(data, start_row, end_row, self.index) + .map(|dtype| (dtype, DTypeFrom::Guessed)) + }) + } + + pub(super) fn finish( + self, + data: &Range, + start_row: usize, + end_row: usize, + specified_dtypes: Option<&DTypeMap>, + ) -> FastExcelResult { + let (dtype, dtype_from) = self + .dtype_info(data, start_row, end_row, specified_dtypes) + .with_context(|| format!("could not determine dtype for column {}", self.name))?; + Ok(ColumnInfo::new( + self.name, + self.index, + self.column_name_from, + dtype, + dtype_from, + )) + } +} diff --git a/src/types/python/excelsheet.rs b/src/types/python/excelsheet/mod.rs similarity index 70% rename from src/types/python/excelsheet.rs rename to src/types/python/excelsheet/mod.rs index e0b5fde..616a3ce 100644 --- a/src/types/python/excelsheet.rs +++ b/src/types/python/excelsheet/mod.rs @@ -1,11 +1,15 @@ +pub(crate) mod column_info; + use std::{cmp, collections::HashSet, str::FromStr, sync::Arc}; use crate::{ error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, - types::{dtype::DTypeMap, idx_or_name::IdxOrName}, - utils::arrow::alias_for_name, + types::{ + dtype::{DType, DTypeMap}, + idx_or_name::IdxOrName, + }, }; use arrow::{ @@ -13,7 +17,7 @@ use arrow::{ Array, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array, Int64Array, NullArray, StringArray, TimestampMillisecondArray, }, - datatypes::{DataType as ArrowDataType, Schema, TimeUnit}, + datatypes::{Field, Schema}, pyarrow::ToPyArrow, record_batch::RecordBatch, }; @@ -26,7 +30,9 @@ use pyo3::{ PyAny, PyResult, ToPyObject, }; -use crate::utils::arrow::arrow_schema_from_column_names_and_range; +// use crate::utils::arrow::arrow_schema_from_column_names_and_range; + +use self::column_info::{ColumnInfo, ColumnInfoBuilder, ColumnNameFrom}; #[derive(Debug)] pub(crate) enum Header { @@ -81,77 
+87,6 @@ impl Pagination { self.skip_rows } } - -#[derive(Debug, PartialEq)] -pub(crate) enum SelectedColumns { - All, - ByIndex(Vec), - ByName(Vec), -} - -impl SelectedColumns { - pub(crate) fn validate_columns(&self, column_names: &[String]) -> FastExcelResult<()> { - match self { - SelectedColumns::All => Ok(()), - // If no selected indice is >= to the len of column_names, we're good - SelectedColumns::ByIndex(indices) => indices.iter().try_for_each(|idx| { - if idx >= &column_names.len() { - Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Idx(*idx)).into()) - } else { - Ok(()) - } - }), - // Every selected column must be in the provided column_names - SelectedColumns::ByName(selected_names) => { - selected_names.iter().try_for_each(|selected_name| { - if column_names.contains(selected_name) { - Ok(()) - } else { - Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Name( - selected_name.to_string(), - )) - .into()) - } - }) - } - } - } - - pub(crate) fn idx_for_column( - &self, - col_names: &[String], - col_name: &str, - col_idx: usize, - ) -> Option { - match self { - SelectedColumns::All => None, - SelectedColumns::ByIndex(indices) => { - if indices.contains(&col_idx) { - Some(col_idx) - } else { - None - } - } - SelectedColumns::ByName(names) => { - // cannot use .contains() because we have &String and &str - if names.iter().any(|name| name == col_name) { - col_names.iter().position(|name| name == col_name) - } else { - None - } - } - } - } - - pub(crate) fn to_python<'p>(&self, py: Python<'p>) -> Option<&'p PyList> { - match self { - SelectedColumns::All => None, - SelectedColumns::ByIndex(idx_vec) => Some(PyList::new(py, idx_vec)), - SelectedColumns::ByName(name_vec) => Some(PyList::new(py, name_vec)), - } - } -} - impl TryFrom<&PyList> for SelectedColumns { type Error = FastExcelError; @@ -160,10 +95,8 @@ impl TryFrom<&PyList> for SelectedColumns { if py_list.is_empty() { Err(InvalidParameters("list of selected columns is empty".to_string()).into()) - } 
else if let Ok(name_vec) = py_list.extract::>() { - Ok(Self::ByName(name_vec)) - } else if let Ok(index_vec) = py_list.extract::>() { - Ok(Self::ByIndex(index_vec)) + } else if let Ok(selection) = py_list.extract::>() { + Ok(Self::Selection(selection)) } else { Err( InvalidParameters(format!("expected list[int] | list[str], got {py_list:?}")) @@ -173,7 +106,39 @@ impl TryFrom<&PyList> for SelectedColumns { } } +#[derive(Debug, PartialEq)] +pub(crate) enum SelectedColumns { + All, + Selection(Vec), +} + impl SelectedColumns { + pub(super) fn select_columns( + &self, + column_info: &[ColumnInfo], + ) -> FastExcelResult> { + match self { + SelectedColumns::All => Ok(column_info.to_vec()), + SelectedColumns::Selection(selection) => selection + .iter() + .map(|selected_column| { + match selected_column { + IdxOrName::Idx(index) => column_info + .iter() + .find(|col_info| &col_info.index() == index), + IdxOrName::Name(name) => column_info + .iter() + .find(|col_info| col_info.name() == name.as_str()), + } + .ok_or_else(|| { + FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into() + }) + .map(Clone::clone) + .with_context(|| format!("available columns are: {column_info:?}")) + }) + .collect(), + } + } const ALPHABET: [char; 26] = [ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', @@ -272,7 +237,9 @@ impl FromStr for SelectedColumns { .collect(); let mut sorted_col_indices: Vec = unique_col_indices.into_iter().collect(); sorted_col_indices.sort(); - Ok(Self::ByIndex(sorted_col_indices)) + Ok(Self::Selection( + sorted_col_indices.into_iter().map(IdxOrName::Idx).collect(), + )) } } @@ -311,6 +278,25 @@ impl TryFrom> for SelectedColumns { } } +fn alias_for_name(name: &str, existing_names: &[String]) -> String { + fn rec(name: &str, existing_names: &[String], depth: usize) -> String { + let alias = if depth == 0 { + name.to_owned() + } else { + format!("{name}_{depth}") + }; + match 
existing_names + .iter() + .any(|existing_name| existing_name == &alias) + { + true => rec(name, existing_names, depth + 1), + false => alias, + } + } + + rec(name, existing_names, 0) +} + #[pyclass(name = "_ExcelSheet")] pub(crate) struct ExcelSheet { #[pyo3(get)] @@ -322,8 +308,9 @@ pub(crate) struct ExcelSheet { total_height: Option, width: Option, schema_sample_rows: Option, - selected_columns: SelectedColumns, - available_columns: Vec, + // selected_columns: SelectedColumns, + selected_columns: Vec, + available_columns: Vec, dtypes: Option, } @@ -342,12 +329,7 @@ impl ExcelSheet { dtypes: Option, ) -> FastExcelResult { // Ensuring dtypes are compatible with selected columns - match (&dtypes, &selected_columns) { - (None, _) | (_, SelectedColumns::All) => Ok::<(), FastExcelError>(()), - (Some(DTypeMap::ByIndex(_)), SelectedColumns::ByIndex(_)) => Ok(()), - (Some(DTypeMap::ByName(_)), SelectedColumns::ByName(_)) => Ok(()), - (Some(other), selected_columns) => Err(FastExcelErrorKind::InvalidParameters(format!("invalid dtypes and selected column combiantion, got \"{other:?}\" and \"{selected_columns:?}\"")).into()) - }?; + // Self::validate_dtypes_and_selected_columns(&selected_columns, &dtypes)?; let mut sheet = ExcelSheet { name, @@ -355,59 +337,94 @@ impl ExcelSheet { pagination, data, schema_sample_rows, - selected_columns, dtypes, height: None, total_height: None, width: None, - // an empty vec as it will be replaced + // Empty vecs as they'll be replaced available_columns: Vec::with_capacity(0), + selected_columns: Vec::with_capacity(0), }; - let available_columns = sheet.get_available_columns(); + let available_columns_info = sheet.get_available_columns_info(); - let mut aliased_available_columns = Vec::with_capacity(available_columns.len()); + let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len()); - available_columns.iter().for_each(|column_name| { - aliased_available_columns.push(alias_for_name(column_name, 
&aliased_available_columns)) - }); + let dtype_sample_rows = + sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit()); + let row_limit = cmp::min(dtype_sample_rows, sheet.limit()); - // Ensuring selected columns are valid - sheet - .selected_columns - .validate_columns(&aliased_available_columns) - .with_context(|| { - format!( - "selected columns are invalid, available columns are: {available_columns:?}" + // Finalizing column info + let available_columns = available_columns_info + .into_iter() + .map(|mut column_info_builder| { + // Setting the right alias for every column + let alias = alias_for_name(column_info_builder.name(), &aliased_available_columns); + if alias != column_info_builder.name() { + column_info_builder = column_info_builder.with_name(alias.clone()); + } + aliased_available_columns.push(alias); + // Setting the dtype info + column_info_builder.finish( + &sheet.data, + sheet.offset(), + row_limit, + sheet.dtypes.as_ref(), ) - })?; + }) + .collect::>>()?; + let selected_columns = selected_columns.select_columns(&available_columns)?; + sheet.available_columns = available_columns; + sheet.selected_columns = selected_columns; - sheet.available_columns = aliased_available_columns; + // Figure out dtype for every column Ok(sheet) } - fn get_available_columns(&self) -> Vec { + fn get_available_columns_info(&self) -> Vec { let width = self.data.width(); match &self.header { Header::None => (0..width) - .map(|col_idx| format!("__UNNAMED__{col_idx}")) + .map(|col_idx| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ) + }) .collect(), Header::At(row_idx) => (0..width) .map(|col_idx| { self.data .get((*row_idx, col_idx)) .and_then(|data| data.as_string()) - .unwrap_or(format!("__UNNAMED__{col_idx}")) + .map(|col_name| { + ColumnInfoBuilder::new(col_name, col_idx, ColumnNameFrom::LookedUp) + }) + .unwrap_or_else(|| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + 
ColumnNameFrom::Generated, + ) + }) }) .collect(), Header::With(names) => { let nameless_start_idx = names.len(); names .iter() - .map(ToOwned::to_owned) - .chain( - (nameless_start_idx..width).map(|col_idx| format!("__UNNAMED__{col_idx}")), - ) + .enumerate() + .map(|(col_idx, name)| { + ColumnInfoBuilder::new(name.to_owned(), col_idx, ColumnNameFrom::Provided) + }) + .chain((nameless_start_idx..width).map(|col_idx| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ) + })) .collect() } } @@ -533,23 +550,14 @@ fn create_duration_array( )) } -impl TryFrom<&ExcelSheet> for Schema { - type Error = FastExcelError; - - fn try_from(sheet: &ExcelSheet) -> Result { - // Checking how many rows we want to use to determine the dtype for a column. If sample_rows is - // not provided, we sample limit rows, i.e on the entire column - let sample_rows = sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit()); - - arrow_schema_from_column_names_and_range( - sheet.data(), - &sheet.available_columns, - sheet.offset(), - // If sample_rows is higher than the sheet's limit, use the limit instead - cmp::min(sample_rows, sheet.limit()), - &sheet.selected_columns, - sheet.dtypes.as_ref(), - ) +impl From<&ExcelSheet> for Schema { + fn from(sheet: &ExcelSheet) -> Self { + let fields: Vec<_> = sheet + .selected_columns + .iter() + .map(|col_info| Field::new(col_info.name(), col_info.dtype().into(), true)) + .collect(); + Schema::new(fields) } } @@ -560,65 +568,45 @@ impl TryFrom<&ExcelSheet> for RecordBatch { let offset = sheet.offset(); let limit = sheet.limit(); - let schema = Schema::try_from(sheet) - .with_context(|| format!("could not build schema for sheet {}", sheet.name))?; - let mut iter = sheet - .available_columns + .selected_columns .iter() - .enumerate() - .filter_map(|(idx, column_name)| { - // checking if the current column has been selected - if let Some(col_idx) = match sheet.selected_columns { - // All 
columns selected, return the current index - SelectedColumns::All => Some(idx), - // Otherwise, return its index. If None is found, it means the column was not - // selected, and we will just continue - _ => sheet.selected_columns.idx_for_column( - &sheet.available_columns, - column_name, - idx, - ), - } { - // At this point, we know for sure that the column is in the schema so we can - // safely unwrap - let field = schema.field_with_name(column_name).unwrap(); - Some(( - field.name(), - match field.data_type() { - ArrowDataType::Boolean => { - create_boolean_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Int64 => { - create_int_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Float64 => { - create_float_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Utf8 => { - create_string_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - create_datetime_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Date32 => { - create_date_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Duration(TimeUnit::Millisecond) => { - create_duration_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), - _ => unreachable!(), - }, - )) - } else { - None - } + .map(|column_info| { + // At this point, we know for sure that the column is in the schema so we can + // safely unwrap + ( + column_info.name(), + match column_info.dtype() { + DType::Bool => { + create_boolean_array(sheet.data(), column_info.index(), offset, limit) + } + DType::Int => { + create_int_array(sheet.data(), column_info.index(), offset, limit) + } + DType::Float => { + create_float_array(sheet.data(), column_info.index(), offset, limit) + } + DType::String => { + create_string_array(sheet.data(), column_info.index(), offset, limit) + } + DType::DateTime => { + create_datetime_array(sheet.data(), column_info.index(), offset, 
limit) + } + DType::Date => { + create_date_array(sheet.data(), column_info.index(), offset, limit) + } + DType::Duration => { + create_duration_array(sheet.data(), column_info.index(), offset, limit) + } + DType::Null => Arc::new(NullArray::new(limit - offset)), + }, + ) }) .peekable(); // If the iterable is empty, try_from_iter returns an Err if iter.peek().is_none() { + let schema: Schema = sheet.into(); Ok(RecordBatch::new_empty(Arc::new(schema))) } else { RecordBatch::try_from_iter(iter) @@ -663,13 +651,13 @@ impl ExcelSheet { } #[getter] - pub fn selected_columns<'p>(&'p self, py: Python<'p>) -> Option<&PyList> { - self.selected_columns.to_python(py) + pub fn selected_columns<'p>(&'p self, _py: Python<'p>) -> Vec { + self.selected_columns.clone() } #[getter] - pub fn available_columns<'p>(&'p self, py: Python<'p>) -> &PyList { - PyList::new(py, &self.available_columns) + pub fn available_columns<'p>(&'p self, _py: Python<'p>) -> Vec { + self.available_columns.clone() } #[getter] @@ -718,7 +706,7 @@ mod tests { let py_list = PyList::new(py, vec![0, 1, 2]).as_ref(); assert_eq!( TryInto::::try_into(Some(py_list)).unwrap(), - SelectedColumns::ByIndex(vec![0, 1, 2]) + SelectedColumns::Selection([0, 1, 2].into_iter().map(IdxOrName::Idx).collect()) ) }); } @@ -729,7 +717,31 @@ mod tests { let py_list = PyList::new(py, vec!["foo", "bar"]).as_ref(); assert_eq!( TryInto::::try_into(Some(py_list)).unwrap(), - SelectedColumns::ByName(vec!["foo".to_string(), "bar".to_string()]) + SelectedColumns::Selection( + ["foo", "bar"] + .iter() + .map(ToString::to_string) + .map(IdxOrName::Name) + .collect() + ) + ) + }); + } + + #[test] + fn selected_columns_from_list_of_valid_strings_and_ints() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec!["foo", "bar"]); + py_list.append(42).unwrap(); + py_list.append(5).unwrap(); + assert_eq!( + TryInto::::try_into(Some(py_list.as_ref())).unwrap(), + SelectedColumns::Selection(vec![ + IdxOrName::Name("foo".to_string()), + 
IdxOrName::Name("bar".to_string()), + IdxOrName::Idx(42), + IdxOrName::Idx(5) + ]) ) }); } @@ -774,9 +786,11 @@ mod tests { // Ranges beyond Z #[case("A,y:AB", vec![0, 24, 25, 26, 27])] #[case("BB:BE,DDC:DDF", vec![53, 54, 55, 56, 2810, 2811, 2812, 2813])] - fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected: Vec) { + fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected_indices: Vec) { Python::with_gil(|py| { - let expected_range = SelectedColumns::ByIndex(expected); + let expected_range = SelectedColumns::Selection( + expected_indices.into_iter().map(IdxOrName::Idx).collect(), + ); let input = PyString::new(py, raw).as_ref(); let range = TryInto::::try_into(Some(input)) diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs deleted file mode 100644 index 0f7d42b..0000000 --- a/src/utils/arrow.rs +++ /dev/null @@ -1,270 +0,0 @@ -use std::{collections::HashSet, sync::OnceLock, usize}; - -use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; -use calamine::{CellErrorType, Data as CalData, DataType, Range}; - -use crate::{ - error::{FastExcelErrorKind, FastExcelResult}, - types::{dtype::DTypeMap, python::excelsheet::SelectedColumns}, -}; - -/// All the possible string values that should be considered as NULL -const NULL_STRING_VALUES: [&str; 19] = [ - "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", - "", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null", -]; - -fn get_cell_type(data: &Range, row: usize, col: usize) -> FastExcelResult { - let cell = data - .get((row, col)) - .ok_or_else(|| FastExcelErrorKind::CannotRetrieveCellData(row, col))?; - - match cell { - CalData::Int(_) => Ok(ArrowDataType::Int64), - CalData::Float(_) => Ok(ArrowDataType::Float64), - CalData::String(v) => match v { - v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(ArrowDataType::Null), - _ => Ok(ArrowDataType::Utf8), - }, - CalData::Bool(_) => Ok(ArrowDataType::Boolean), - // 
Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be - // a duration or a datatime - CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() { - ArrowDataType::Timestamp(TimeUnit::Millisecond, None) - } else { - ArrowDataType::Duration(TimeUnit::Millisecond) - }), - // These types contain an ISO8601 representation of a date/datetime or a duration - CalData::DateTimeIso(_) => match cell.as_datetime() { - // If we cannot convert the cell to a datetime, we're working on a date - Some(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)), - // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime - // rather than a date - None => Ok(ArrowDataType::Date32), - }, - // A simple duration - CalData::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), - // Errors and nulls - CalData::Error(err) => match err { - CellErrorType::NA => Ok(ArrowDataType::Null), - _ => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()), - }, - CalData::Empty => Ok(ArrowDataType::Null), - } -} - -static FLOAT_TYPES_CELL: OnceLock> = OnceLock::new(); -static INT_TYPES_CELL: OnceLock> = OnceLock::new(); -static STRING_TYPES_CELL: OnceLock> = OnceLock::new(); - -fn float_types() -> &'static HashSet { - FLOAT_TYPES_CELL.get_or_init(|| { - HashSet::from([ - ArrowDataType::Int64, - ArrowDataType::Float64, - ArrowDataType::Boolean, - ]) - }) -} - -fn int_types() -> &'static HashSet { - INT_TYPES_CELL.get_or_init(|| HashSet::from([ArrowDataType::Int64, ArrowDataType::Boolean])) -} - -fn string_types() -> &'static HashSet { - STRING_TYPES_CELL.get_or_init(|| { - HashSet::from([ - ArrowDataType::Int64, - ArrowDataType::Float64, - ArrowDataType::Utf8, - ]) - }) -} - -fn get_arrow_column_type( - data: &Range, - start_row: usize, - end_row: usize, - col: usize, -) -> FastExcelResult { - let mut column_types = (start_row..end_row) - .map(|row| get_cell_type(data, row, col)) - 
.collect::>>()?; - - // All columns are nullable anyway so we're not taking Null into account here - column_types.remove(&ArrowDataType::Null); - - if column_types.is_empty() { - // If no type apart from NULL was found, it's a NULL column - Ok(ArrowDataType::Null) - } else if column_types.len() == 1 { - // If a single non-null type was found, return it - Ok(column_types.into_iter().next().unwrap()) - } else if column_types.is_subset(int_types()) { - // If every cell in the column can be converted to an int, return int64 - Ok(ArrowDataType::Int64) - } else if column_types.is_subset(float_types()) { - // If every cell in the column can be converted to a float, return Float64 - Ok(ArrowDataType::Float64) - } else if column_types.is_subset(string_types()) { - // If every cell in the column can be converted to a string, return Utf8 - Ok(ArrowDataType::Utf8) - } else { - // NOTE: Not being too smart about multi-types columns for now - Err( - FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}")) - .into(), - ) - } -} - -pub(crate) fn alias_for_name(name: &str, existing_names: &[String]) -> String { - fn rec(name: &str, existing_names: &[String], depth: usize) -> String { - let alias = if depth == 0 { - name.to_owned() - } else { - format!("{name}_{depth}") - }; - match existing_names - .iter() - .any(|existing_name| existing_name == &alias) - { - true => rec(name, existing_names, depth + 1), - false => alias, - } - } - - rec(name, existing_names, 0) -} - -pub(crate) fn arrow_schema_from_column_names_and_range( - range: &Range, - column_names: &[String], - row_idx: usize, - row_limit: usize, - selected_columns: &SelectedColumns, - dtypes: Option<&DTypeMap>, -) -> FastExcelResult { - // clippy suggests to split this type annotation into type declaration, but that would make it - // less clear IMO - #[allow(clippy::type_complexity)] - let arrow_type_for_column: Box FastExcelResult> = - match selected_columns { - // In case all columns are 
selected, we look up the dtype for the column by name, - // fallback on a lookup by index, and finally on get_arrow_column_type - SelectedColumns::All => Box::new(|col_idx, col_name| match dtypes { - None => get_arrow_column_type(range, row_idx, row_limit, col_idx), - Some(dts) => { - if let Some(dtype_by_name) = dts.dtype_for_col_name(col_name) { - Ok(dtype_by_name.into()) - } else if let Some(dtype_by_idx) = dts.dtype_for_col_idx(col_idx) { - Ok(dtype_by_idx.into()) - } else { - get_arrow_column_type(range, row_idx, row_limit, col_idx) - } - } - }), - // If columns are selected by name, look up the dtype by name and fallback on - // get_arrow_column_type - SelectedColumns::ByName(_) => Box::new(|col_idx, col_name| { - dtypes - .and_then(|dtypes| dtypes.dtype_for_col_name(col_name)) - .map(|dtype| Ok(dtype.into())) - .unwrap_or_else(|| get_arrow_column_type(range, row_idx, row_limit, col_idx)) - }), - - // If columns are selected by index, look up the dtype by name and fallback on - // get_arrow_column_type - SelectedColumns::ByIndex(_) => Box::new(|col_idx, _col_name| { - dtypes - .and_then(|dtypes| dtypes.dtype_for_col_idx(col_idx)) - .map(|dtype| Ok(dtype.into())) - .unwrap_or_else(|| get_arrow_column_type(range, row_idx, row_limit, col_idx)) - }), - }; - - let mut fields = Vec::with_capacity(column_names.len()); - let mut existing_names = Vec::with_capacity(column_names.len()); - - for (idx, name) in column_names.iter().enumerate() { - // If we have an index for the given column, extract it and add it to the schema. 
Otherwise, - // just ignore it - if let Some(col_idx) = match selected_columns { - SelectedColumns::All => Some(idx), - _ => selected_columns.idx_for_column(column_names, name, idx), - } { - let col_type = arrow_type_for_column(col_idx, name)?; - let aliased_name = alias_for_name(name, &existing_names); - fields.push(Field::new(&aliased_name, col_type, true)); - existing_names.push(aliased_name); - } - } - - Ok(Schema::new(fields)) -} - -#[cfg(test)] -mod tests { - use calamine::Cell; - use rstest::{fixture, rstest}; - - use super::*; - - #[fixture] - fn range() -> Range { - Range::from_sparse(vec![ - // First column - Cell::new((0, 0), CalData::Bool(true)), - Cell::new((1, 0), CalData::Bool(false)), - Cell::new((2, 0), CalData::String("NULL".to_string())), - Cell::new((3, 0), CalData::Int(42)), - Cell::new((4, 0), CalData::Float(13.37)), - Cell::new((5, 0), CalData::String("hello".to_string())), - Cell::new((6, 0), CalData::Empty), - Cell::new((7, 0), CalData::String("#N/A".to_string())), - Cell::new((8, 0), CalData::Int(12)), - Cell::new((9, 0), CalData::Float(12.21)), - Cell::new((10, 0), CalData::Bool(true)), - Cell::new((11, 0), CalData::Int(1337)), - ]) - } - - #[rstest] - // pure bool - #[case(0, 2, ArrowDataType::Boolean)] - // pure int - #[case(3, 4, ArrowDataType::Int64)] - // pure float - #[case(4, 5, ArrowDataType::Float64)] - // pure string - #[case(5, 6, ArrowDataType::Utf8)] - // pure int + float - #[case(3, 5, ArrowDataType::Float64)] - // null + int + float - #[case(2, 5, ArrowDataType::Float64)] - // float + string - #[case(4, 6, ArrowDataType::Utf8)] - // int + float + string - #[case(3, 6, ArrowDataType::Utf8)] - // null + int + float + string + empty + null - #[case(2, 8, ArrowDataType::Utf8)] - // empty + null + int - #[case(6, 9, ArrowDataType::Int64)] - // int + float + null - #[case(7, 10, ArrowDataType::Float64)] - // int + float + bool + null - #[case(7, 11, ArrowDataType::Float64)] - // int + bool - #[case(10, 12, ArrowDataType::Int64)] 
- fn get_arrow_column_type_multi_dtype_ok( - range: Range, - #[case] start_row: usize, - #[case] end_row: usize, - #[case] expected: ArrowDataType, - ) { - assert_eq!( - get_arrow_column_type(&range, start_row, end_row, 0).unwrap(), - expected - ); - } -} diff --git a/src/utils/mod.rs b/src/utils/mod.rs deleted file mode 100644 index 0e9f34f..0000000 --- a/src/utils/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod arrow;