From f3c92d65189d66d317cf0dc834b4efef23009a4a Mon Sep 17 00:00:00 2001
From: Luka Peschke <luka.peschke@toucantoco.com>
Date: Mon, 25 Mar 2024 09:37:02 +0100
Subject: [PATCH] feat: support mixed indices and column names for dtypes and
 column selection (#206)

* refactor: moved excelsheet to a subdirectory

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat: support mixed indices and column names for dtypes and column selection

Work items:

* Introduced a new type providing information about a column: `ColumnInfo`
* `ExcelSheet.selected_columns` now returns a list of ColumnInfo for all
  selected columns
* `ExcelSheet.available_columns` now returns a list of ColumnInfo for all
  available columns
* A lot of to/from Python conversion code has been wrapped in traits to
  reduce manual validation

closes #198

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
---
 python/fastexcel/__init__.py                  |  30 +-
 python/fastexcel/_fastexcel.pyi               |  34 +-
 python/tests/test_alias_generation.py         |   6 +-
 python/tests/test_column_selection.py         | 123 ++++--
 python/tests/test_dtypes.py                   |   2 +-
 src/lib.rs                                    |   4 +-
 src/types/dtype.rs                            | 269 ++++++++----
 src/types/idx_or_name.rs                      |  45 +-
 src/types/python/excelreader.rs               |  15 +-
 src/types/python/excelsheet/column_info.rs    | 273 ++++++++++++
 .../{excelsheet.rs => excelsheet/mod.rs}      | 390 +++++++++---------
 src/utils/arrow.rs                            | 270 ------------
 src/utils/mod.rs                              |   1 -
 13 files changed, 848 insertions(+), 614 deletions(-)
 create mode 100644 src/types/python/excelsheet/column_info.rs
 rename src/types/python/{excelsheet.rs => excelsheet/mod.rs} (70%)
 delete mode 100644 src/utils/arrow.rs
 delete mode 100644 src/utils/mod.rs

diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
index 749939a..f8e2c33 100644
--- a/python/fastexcel/__init__.py
+++ b/python/fastexcel/__init__.py
@@ -18,6 +18,7 @@
     CalamineCellError,
     CalamineError,
     CannotRetrieveCellDataError,
+    ColumnInfo,
     ColumnNotFoundError,
     FastExcelError,
     InvalidParametersError,
@@ -30,7 +31,9 @@
 from ._fastexcel import read_excel as _read_excel
 
 DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
-DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]"
+DTypeMap: TypeAlias = "dict[str | int, DType]"
+ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
+DTypeFrom: TypeAlias = Literal["provided_by_index", "provided_by_name", "guessed"]
 
 
 class ExcelSheet:
@@ -60,12 +63,12 @@ def total_height(self) -> int:
         return self._sheet.total_height
 
     @property
-    def selected_columns(self) -> list[str] | list[int] | None:
+    def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
         return self._sheet.selected_columns
 
     @property
-    def available_columns(self) -> list[str]:
+    def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
         return self._sheet.available_columns
 
@@ -141,13 +144,12 @@ def load_sheet(
                                    If `None`, all rows will be used.
         :param use_columns: Specifies the columns to use. Can either be:
                             - `None` to select all columns
-                            - a list of strings, the column names
-                            - a list of ints, the column indices (starting at 0)
-                            - a string, a comma separated list of Excel column letters and column
+                            - A list of strings and ints, the column names and/or indices
+                              (starting at 0)
+                            - A string, a comma separated list of Excel column letters and column
                               ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
                               `A,B,C,D,E` and `A,C,E,F`)
-        :param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
-                       is a list of ints or an Excel range), or column names
+        :param dtypes: An optional dict of dtypes. Keys can be column indices or names
         """
         return ExcelSheet(
             self._reader.load_sheet(
@@ -235,10 +237,22 @@ def read_excel(source: Path | str | bytes) -> ExcelReader:
 
 
 __all__ = (
+    ## version
     "__version__",
+    ## main entrypoint
     "read_excel",
+    ## Python types
+    "DType",
+    "DTypeMap",
+    # Excel reader
     "ExcelReader",
+    # Excel sheet
     "ExcelSheet",
+    # Column metadata
+    "DTypeFrom",
+    "ColumnNameFrom",
+    "ColumnInfo",
+    # Exceptions
     "FastExcelError",
     "CannotRetrieveCellDataError",
     "CalamineCellError",
diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
index 421c4c3..0d6d3e1 100644
--- a/python/fastexcel/_fastexcel.pyi
+++ b/python/fastexcel/_fastexcel.pyi
@@ -4,9 +4,31 @@ from typing import Literal
 
 import pyarrow as pa
 
-_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+DTypeMap = dict[str | int, DType]
+ColumnNameFrom = Literal["provided", "looked_up", "generated"]
+DTypeFrom = Literal["provided_by_index", "provided_by_name", "guessed"]
 
-_DTypeMap = dict[str, _DType] | dict[int, _DType]
+class ColumnInfo:
+    def __init__(
+        self,
+        *,
+        name: str,
+        index: int,
+        column_name_from: ColumnNameFrom,
+        dtype: DType,
+        dtype_from: DTypeFrom,
+    ) -> None: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def index(self) -> int: ...
+    @property
+    def dtype(self) -> DType: ...
+    @property
+    def column_name_from(self) -> ColumnNameFrom: ...
+    @property
+    def dtype_from(self) -> DTypeFrom: ...
 
 class _ExcelSheet:
     @property
@@ -25,13 +47,13 @@ class _ExcelSheet:
     def offset(self) -> int:
         """The sheet's offset before data starts"""
     @property
-    def selected_columns(self) -> list[str] | list[int] | None:
+    def selected_columns(self) -> list[ColumnInfo]:
         """The sheet's selected columns"""
     @property
-    def available_columns(self) -> list[str]:
+    def available_columns(self) -> list[ColumnInfo]:
         """The columns available for the given sheet"""
     @property
-    def specified_dtypes(self) -> _DTypeMap | None:
+    def specified_dtypes(self) -> DTypeMap | None:
         """The dtypes specified for the sheet"""
     def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
@@ -49,7 +71,7 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
-        dtypes: _DTypeMap | None = None,
+        dtypes: DTypeMap | None = None,
     ) -> _ExcelSheet: ...
     @property
     def sheet_names(self) -> list[str]: ...
diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py
index 562dd3b..728c73a 100644
--- a/python/tests/test_alias_generation.py
+++ b/python/tests/test_alias_generation.py
@@ -9,14 +9,16 @@
 from utils import path_for_fixture
 
 
-@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]])
+@pytest.mark.parametrize(
+    "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]]
+)
 def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
     excel_reader = fastexcel.read_excel(
         path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
     )
 
     sheet = excel_reader.load_sheet(0, use_columns=use_columns)
-    assert sheet.available_columns == ["col", "col_1", "col_2"]
+    assert [col.name for col in sheet.available_columns] == ["col", "col_1", "col_2"]
 
     pd_assert_frame_equal(
         sheet.to_pandas(),
diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py
index a4cade3..ce1dca5 100644
--- a/python/tests/test_column_selection.py
+++ b/python/tests/test_column_selection.py
@@ -17,12 +17,27 @@ def excel_reader_single_sheet() -> fastexcel.ExcelReader:
     return fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
 
 
-def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
+@pytest.fixture
+def expected_column_info() -> list[fastexcel.ColumnInfo]:
+    return [
+        fastexcel.ColumnInfo(
+            name="Month", index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed"
+        ),
+        fastexcel.ColumnInfo(
+            name="Year", index=1, column_name_from="looked_up", dtype="float", dtype_from="guessed"
+        ),
+    ]
+
+
+def test_single_sheet_all_columns(
+    excel_reader_single_sheet: fastexcel.ExcelReader,
+    expected_column_info: list[fastexcel.ColumnInfo],
+) -> None:
     sheet = excel_reader_single_sheet.load_sheet(0)
 
     sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None)
-    assert sheet.selected_columns is None
-    assert sheet.available_columns == ["Month", "Year"]
+    assert sheet.selected_columns == expected_column_info
+    assert sheet.available_columns == expected_column_info
 
     expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
     expected_pd_df = pd.DataFrame(expected)
@@ -39,16 +54,19 @@ def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelRead
     pl_assert_frame_equal(pl_df_explicit_arg, expected_pl_df)
 
 
-def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
+def test_single_sheet_subset_by_str(
+    excel_reader_single_sheet: fastexcel.ExcelReader,
+    expected_column_info: list[fastexcel.ColumnInfo],
+) -> None:
     expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
 
     # looks like mypy 1.8 became more stupid
     sheets: list[str | int] = [0, "January"]
     for sheet_name_or_idx in sheets:
-        for col in ["Month", "Year"]:
+        for idx, col in enumerate(["Month", "Year"]):
             sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col])
-            assert sheet.selected_columns == [col]
-            assert sheet.available_columns == ["Month", "Year"]
+            assert sheet.selected_columns == [expected_column_info[idx]]
+            assert sheet.available_columns == expected_column_info
 
             pd_df = sheet.to_pandas()
             pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]}))
@@ -57,15 +75,18 @@ def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelRe
             pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]}))
 
 
-def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
+def test_single_sheet_subset_by_index(
+    excel_reader_single_sheet: fastexcel.ExcelReader,
+    expected_column_info: list[fastexcel.ColumnInfo],
+) -> None:
     expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
 
     sheets: list[str | int] = [0, "January"]
     for sheet_name_or_idx in sheets:
         for idx, col_name in enumerate(["Month", "Year"]):
             sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx])
-            assert sheet.selected_columns == [idx]
-            assert sheet.available_columns == ["Month", "Year"]
+            assert sheet.selected_columns == [expected_column_info[idx]]
+            assert sheet.available_columns == expected_column_info
 
             pd_df = sheet.to_pandas()
             pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]}))
@@ -90,9 +111,39 @@ def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]:
     }
 
 
+@pytest.fixture
+def sheet_with_unnamed_columns_expected_column_info() -> list[fastexcel.ColumnInfo]:
+    return [
+        fastexcel.ColumnInfo(
+            name="col1", index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed"
+        ),
+        fastexcel.ColumnInfo(
+            name="__UNNAMED__1",
+            index=1,
+            column_name_from="generated",
+            dtype="float",
+            dtype_from="guessed",
+        ),
+        fastexcel.ColumnInfo(
+            name="col3", index=2, column_name_from="looked_up", dtype="string", dtype_from="guessed"
+        ),
+        fastexcel.ColumnInfo(
+            name="__UNNAMED__3",
+            index=3,
+            column_name_from="generated",
+            dtype="float",
+            dtype_from="guessed",
+        ),
+        fastexcel.ColumnInfo(
+            name="col5", index=4, column_name_from="looked_up", dtype="string", dtype_from="guessed"
+        ),
+    ]
+
+
 def test_single_sheet_with_unnamed_columns(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
     single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
+    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
 ) -> None:
     use_columns_str = ["col1", "col3", "__UNNAMED__3"]
     use_columns_idx = [0, 2, 3]
@@ -103,8 +154,12 @@ def test_single_sheet_with_unnamed_columns(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str
     )
-    assert sheet.selected_columns == use_columns_str
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.selected_columns == [
+        sheet_with_unnamed_columns_expected_column_info[0],
+        sheet_with_unnamed_columns_expected_column_info[2],
+        sheet_with_unnamed_columns_expected_column_info[3],
+    ]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -112,8 +167,12 @@ def test_single_sheet_with_unnamed_columns(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx
     )
-    assert sheet.selected_columns == use_columns_idx
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.selected_columns == [
+        sheet_with_unnamed_columns_expected_column_info[0],
+        sheet_with_unnamed_columns_expected_column_info[2],
+        sheet_with_unnamed_columns_expected_column_info[3],
+    ]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -122,6 +181,7 @@ def test_single_sheet_with_unnamed_columns(
 def test_single_sheet_with_unnamed_columns_and_pagination(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
     single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
+    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
 ) -> None:
     use_columns_str = ["col1", "col3", "__UNNAMED__3"]
     use_columns_idx = [0, 2, 3]
@@ -136,7 +196,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, n_rows=1
     )
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -144,7 +204,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, n_rows=1
     )
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -159,7 +219,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, skip_rows=1
     )
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -167,7 +227,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=1
     )
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -189,7 +249,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names
     )
-    assert sheet.available_columns == column_names
+    assert [col.name for col in sheet.available_columns] == column_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -197,7 +257,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names
     )
-    assert sheet.available_columns == column_names
+    assert [col.name for col in sheet.available_columns] == column_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
@@ -208,7 +268,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names
     )
-    assert sheet.available_columns == column_names
+    assert [col.name for col in sheet.available_columns] == column_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
@@ -216,7 +276,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names
     )
-    assert sheet.available_columns == column_names
+    assert [col.name for col in sheet.available_columns] == column_names
 
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
@@ -225,9 +285,9 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
 def test_single_sheet_with_unnamed_columns_and_str_range(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
     single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
+    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
 ) -> None:
     use_columns_str = "A,C:E"
-    use_columns_idx = [0, 2, 3, 4]
     expected = {
         k: v
         for k, v in single_sheet_with_unnamed_columns_expected.items()
@@ -236,8 +296,11 @@ def test_single_sheet_with_unnamed_columns_and_str_range(
     sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
         "With unnamed columns", use_columns=use_columns_str
     )
-    assert sheet.selected_columns == use_columns_idx
-    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    assert sheet.selected_columns == (
+        sheet_with_unnamed_columns_expected_column_info[:1]
+        + sheet_with_unnamed_columns_expected_column_info[2:]
+    )
+    assert sheet.available_columns == sheet_with_unnamed_columns_expected_column_info
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
 
@@ -269,11 +332,11 @@ def test_single_sheet_invalid_column_indices_empty_list(
 def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
 ) -> None:
-    expected_message = """column with name "nope" not found
+    expected_message = """column with name \"nope\" not found
 Context:
-    0: selected columns are invalid, available columns are: ["Month", "Year"]
+    0: available columns are: .*
 """
-    with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
+    with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
         excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"])
 
 
@@ -282,7 +345,7 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
 ) -> None:
     expected_message = """column at index 42 not found
 Context:
-    0: selected columns are invalid, available columns are: ["Month", "Year"]
+    0: available columns are: .*
 """
-    with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
+    with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
         excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42])
diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
index 82392ef..acc5c4e 100644
--- a/python/tests/test_dtypes.py
+++ b/python/tests/test_dtypes.py
@@ -130,7 +130,7 @@ def test_sheet_with_mixed_dtypes_specify_dtypes(
     expected_pd_dtype: str,
     expected_pl_dtype: pl.DataType,
 ) -> None:
-    dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}  # type:ignore[dict-item]
+    dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}
     excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
     sheet = excel_reader.load_sheet(0, dtypes=dtypes, n_rows=5)
     assert sheet.specified_dtypes == dtypes
diff --git a/src/lib.rs b/src/lib.rs
index befda90..3ae7070 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,10 +1,9 @@
 mod error;
 mod types;
-mod utils;
 
 use error::{py_errors, ErrorContext};
 use pyo3::prelude::*;
-use types::python::{ExcelReader, ExcelSheet};
+use types::python::{excelsheet::column_info::ColumnInfo, ExcelReader, ExcelSheet};
 
 /// Reads an excel file and returns an object allowing to access its sheets and a bit of metadata
 #[pyfunction]
@@ -41,6 +40,7 @@ fn get_version() -> String {
 #[pymodule]
 fn _fastexcel(py: Python, m: &PyModule) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(read_excel, m)?)?;
+    m.add_class::<ColumnInfo>()?;
     m.add_class::<ExcelSheet>()?;
     m.add_class::<ExcelReader>()?;
     m.add("__version__", get_version())?;
diff --git a/src/types/dtype.rs b/src/types/dtype.rs
index 1484bfd..90e7576 100644
--- a/src/types/dtype.rs
+++ b/src/types/dtype.rs
@@ -1,14 +1,18 @@
-use std::{collections::HashMap, str::FromStr};
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    sync::OnceLock,
+};
 
 use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
-use pyo3::{
-    types::{IntoPyDict, PyDict},
-    PyObject, Python, ToPyObject,
-};
+use calamine::{CellErrorType, Data as CalData, DataType, Range};
+use pyo3::{FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject};
+
+use crate::error::{py_errors::IntoPyResult, FastExcelError, FastExcelErrorKind, FastExcelResult};
 
-use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};
+use super::idx_or_name::IdxOrName;
 
-#[derive(Debug)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)]
 pub(crate) enum DType {
     Null,
     Int,
@@ -41,8 +45,8 @@ impl FromStr for DType {
     }
 }
 
-impl ToPyObject for DType {
-    fn to_object(&self, py: Python<'_>) -> PyObject {
+impl ToString for DType {
+    fn to_string(&self) -> String {
         match self {
             DType::Null => "null",
             DType::Int => "int",
@@ -53,83 +57,32 @@ impl ToPyObject for DType {
             DType::Date => "date",
             DType::Duration => "duration",
         }
-        .to_object(py)
-    }
-}
-
-#[derive(Debug)]
-pub(crate) enum DTypeMap {
-    ByIndex(HashMap<usize, DType>),
-    ByName(HashMap<String, DType>),
-}
-
-impl DTypeMap {
-    pub(crate) fn dtype_for_col_name(&self, col_name: &String) -> Option<&DType> {
-        match self {
-            DTypeMap::ByName(name_map) => name_map.get(col_name),
-            _ => None,
-        }
-    }
-
-    pub(crate) fn dtype_for_col_idx(&self, col_idx: usize) -> Option<&DType> {
-        match self {
-            DTypeMap::ByIndex(idx_map) => idx_map.get(&col_idx),
-            _ => None,
-        }
-    }
-}
-
-impl<S: AsRef<str>> TryFrom<HashMap<usize, S>> for DTypeMap {
-    type Error = FastExcelError;
-
-    fn try_from(value: HashMap<usize, S>) -> FastExcelResult<Self> {
-        value
-            .into_iter()
-            .map(|(column, raw_dtype)| {
-                raw_dtype
-                    .as_ref()
-                    .parse()
-                    .map(|raw_dtype| (column, raw_dtype))
-            })
-            .collect::<FastExcelResult<HashMap<_, _>>>()
-            .map(Self::ByIndex)
+        .to_string()
     }
 }
 
-impl<S: AsRef<str>> TryFrom<HashMap<String, S>> for DTypeMap {
-    type Error = FastExcelError;
-
-    fn try_from(value: HashMap<String, S>) -> FastExcelResult<Self> {
-        value
-            .into_iter()
-            .map(|(column, raw_dtype)| {
-                raw_dtype
-                    .as_ref()
-                    .parse()
-                    .map(|raw_dtype| (column, raw_dtype))
-            })
-            .collect::<FastExcelResult<HashMap<_, _>>>()
-            .map(Self::ByName)
+impl ToPyObject for DType {
+    fn to_object(&self, py: Python<'_>) -> PyObject {
+        self.to_string().to_object(py)
     }
 }
 
-impl TryFrom<&PyDict> for DTypeMap {
-    type Error = FastExcelError;
-
-    fn try_from(py_dict: &PyDict) -> FastExcelResult<Self> {
-        if let Ok(string_map) = py_dict.extract::<HashMap<String, &str>>() {
-            string_map.try_into()
-        } else if let Ok(string_map) = py_dict.extract::<HashMap<usize, &str>>() {
-            string_map.try_into()
+impl FromPyObject<'_> for DType {
+    fn extract(py_dtype: &PyAny) -> PyResult<Self> {
+        if let Ok(dtype_str) = py_dtype.extract::<&str>() {
+            dtype_str.parse()
         } else {
             Err(FastExcelErrorKind::InvalidParameters(format!(
-                "unsupported dtype map: {py_dict:?}"
+                "{py_dtype:?} cannot be converted to str"
             ))
             .into())
         }
+        .into_pyresult()
     }
 }
 
+pub(crate) type DTypeMap = HashMap<IdxOrName, DType>;
+
 impl From<&DType> for ArrowDataType {
     fn from(dtype: &DType) -> Self {
         match dtype {
@@ -145,20 +98,166 @@ impl From<&DType> for ArrowDataType {
     }
 }
 
-impl ToPyObject for DTypeMap {
-    fn to_object(&self, py: Python<'_>) -> PyObject {
-        match self {
-            DTypeMap::ByIndex(idx_map) => idx_map
-                .iter()
-                .map(|(k, v)| (k, v.to_object(py)))
-                .into_py_dict(py)
-                .into(),
+/// All the possible string values that should be considered as NULL
+const NULL_STRING_VALUES: [&str; 19] = [
+    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
+    "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null",
+];
+
+fn get_cell_dtype(data: &Range<CalData>, row: usize, col: usize) -> FastExcelResult<DType> {
+    let cell = data
+        .get((row, col))
+        .ok_or_else(|| FastExcelErrorKind::CannotRetrieveCellData(row, col))?;
+
+    match cell {
+        CalData::Int(_) => Ok(DType::Int),
+        CalData::Float(_) => Ok(DType::Float),
+        CalData::String(v) => match v {
+            v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(DType::Null),
+            _ => Ok(DType::String),
+        },
+        CalData::Bool(_) => Ok(DType::Bool),
+        // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be
+        // a duration or a datetime
+        CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() {
+            DType::DateTime
+        } else {
+            DType::Duration
+        }),
+        // These types contain an ISO8601 representation of a date/datetime or a duration
+        CalData::DateTimeIso(_) => match cell.as_datetime() {
+            Some(_) => Ok(DType::DateTime),
+            // If we cannot convert the cell to a datetime, we're working on a date
+            // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime
+            // rather than a date
+            None => Ok(DType::Date),
+        },
+        // A simple duration
+        CalData::DurationIso(_) => Ok(DType::Duration),
+        // Errors and nulls
+        CalData::Error(err) => match err {
+            CellErrorType::NA => Ok(DType::Null),
+            _ => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()),
+        },
+        CalData::Empty => Ok(DType::Null),
+    }
+}
+
+static FLOAT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
+static INT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
+static STRING_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
+
+fn float_types() -> &'static HashSet<DType> {
+    FLOAT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::Bool]))
+}
+
+fn int_types() -> &'static HashSet<DType> {
+    INT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Bool]))
+}
 
-            DTypeMap::ByName(name_map) => name_map
-                .iter()
-                .map(|(k, v)| (k, v.to_object(py)))
-                .into_py_dict(py)
+fn string_types() -> &'static HashSet<DType> {
+    STRING_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::String]))
+}
+
+pub(crate) fn get_dtype_for_column(
+    data: &Range<CalData>,
+    start_row: usize,
+    end_row: usize,
+    col: usize,
+) -> FastExcelResult<DType> {
+    let mut column_types = (start_row..end_row)
+        .map(|row| get_cell_dtype(data, row, col))
+        .collect::<FastExcelResult<HashSet<_>>>()?;
+
+    // All columns are nullable anyway so we're not taking Null into account here
+    column_types.remove(&DType::Null);
+
+    if column_types.is_empty() {
+        // If no type apart from NULL was found, it's a NULL column
+        Ok(DType::Null)
+    } else if column_types.len() == 1 {
+        // If a single non-null type was found, return it
+        Ok(column_types.into_iter().next().unwrap())
+    } else if column_types.is_subset(int_types()) {
+        // If every cell in the column can be converted to an int, return int64
+        Ok(DType::Int)
+    } else if column_types.is_subset(float_types()) {
+        // If every cell in the column can be converted to a float, return Float64
+        Ok(DType::Float)
+    } else if column_types.is_subset(string_types()) {
+        // If every cell in the column can be converted to a string, return Utf8
+        Ok(DType::String)
+    } else {
+        // NOTE: Not being too smart about multi-types columns for now
+        Err(
+            FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}"))
                 .into(),
-        }
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use calamine::Cell;
+    use rstest::{fixture, rstest};
+
+    use super::*;
+
+    #[fixture]
+    fn range() -> Range<CalData> {
+        Range::from_sparse(vec![
+            // First column
+            Cell::new((0, 0), CalData::Bool(true)),
+            Cell::new((1, 0), CalData::Bool(false)),
+            Cell::new((2, 0), CalData::String("NULL".to_string())),
+            Cell::new((3, 0), CalData::Int(42)),
+            Cell::new((4, 0), CalData::Float(13.37)),
+            Cell::new((5, 0), CalData::String("hello".to_string())),
+            Cell::new((6, 0), CalData::Empty),
+            Cell::new((7, 0), CalData::String("#N/A".to_string())),
+            Cell::new((8, 0), CalData::Int(12)),
+            Cell::new((9, 0), CalData::Float(12.21)),
+            Cell::new((10, 0), CalData::Bool(true)),
+            Cell::new((11, 0), CalData::Int(1337)),
+        ])
+    }
+
+    #[rstest]
+    // pure bool
+    #[case(0, 2, DType::Bool)]
+    // pure int
+    #[case(3, 4, DType::Int)]
+    // pure float
+    #[case(4, 5, DType::Float)]
+    // pure string
+    #[case(5, 6, DType::String)]
+    // pure int + float
+    #[case(3, 5, DType::Float)]
+    // null + int + float
+    #[case(2, 5, DType::Float)]
+    // float + string
+    #[case(4, 6, DType::String)]
+    // int + float + string
+    #[case(3, 6, DType::String)]
+    // null + int + float + string + empty + null
+    #[case(2, 8, DType::String)]
+    // empty + null + int
+    #[case(6, 9, DType::Int)]
+    // int + float + null
+    #[case(7, 10, DType::Float)]
+    // int + float + bool + null
+    #[case(7, 11, DType::Float)]
+    // int + bool
+    #[case(10, 12, DType::Int)]
+    fn get_arrow_column_type_multi_dtype_ok(
+        range: Range<CalData>,
+        #[case] start_row: usize,
+        #[case] end_row: usize,
+        #[case] expected: DType,
+    ) {
+        assert_eq!(
+            get_dtype_for_column(&range, start_row, end_row, 0).unwrap(),
+            expected
+        );
     }
 }
diff --git a/src/types/idx_or_name.rs b/src/types/idx_or_name.rs
index 503c53d..6788530 100644
--- a/src/types/idx_or_name.rs
+++ b/src/types/idx_or_name.rs
@@ -1,8 +1,8 @@
-use pyo3::PyAny;
+use pyo3::{FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject};
 
-use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};
+use crate::error::{py_errors::IntoPyResult, FastExcelError, FastExcelErrorKind, FastExcelResult};
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
 pub(crate) enum IdxOrName {
     Idx(usize),
     Name(String),
@@ -20,16 +20,43 @@ impl IdxOrName {
 impl TryFrom<&PyAny> for IdxOrName {
     type Error = FastExcelError;
 
-    fn try_from(py_any: &PyAny) -> FastExcelResult<Self> {
-        if let Ok(name) = py_any.extract::<String>() {
-            Ok(IdxOrName::Name(name))
-        } else if let Ok(index) = py_any.extract::<usize>() {
-            Ok(IdxOrName::Idx(index))
+    fn try_from(value: &PyAny) -> FastExcelResult<Self> {
+        if let Ok(index) = value.extract() {
+            Ok(Self::Idx(index))
+        } else if let Ok(name) = value.extract() {
+            Ok(Self::Name(name))
         } else {
             Err(FastExcelErrorKind::InvalidParameters(format!(
-                "cannot create IdxOrName from {py_any:?}"
+                "cannot create IdxOrName from {value:?}"
             ))
             .into())
         }
     }
 }
+
+impl FromPyObject<'_> for IdxOrName {
+    fn extract(value: &PyAny) -> PyResult<Self> {
+        value.try_into().into_pyresult()
+    }
+}
+
+impl ToPyObject for IdxOrName {
+    fn to_object(&self, py: Python<'_>) -> PyObject {
+        match self {
+            IdxOrName::Idx(idx) => idx.to_object(py),
+            IdxOrName::Name(name) => name.to_object(py),
+        }
+    }
+}
+
+impl From<usize> for IdxOrName {
+    fn from(index: usize) -> Self {
+        Self::Idx(index)
+    }
+}
+
+impl From<String> for IdxOrName {
+    fn from(name: String) -> Self {
+        Self::Name(name)
+    }
+}
diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs
index d53589c..8de50de 100644
--- a/src/types/python/excelreader.rs
+++ b/src/types/python/excelreader.rs
@@ -4,7 +4,7 @@ use std::{
 };
 
 use calamine::{open_workbook_auto, open_workbook_auto_from_rs, Data, Range, Reader, Sheets};
-use pyo3::{pyclass, pymethods, types::PyDict, PyAny, PyResult};
+use pyo3::{pyclass, pymethods, PyAny, PyResult};
 
 use crate::{
     error::{
@@ -62,14 +62,6 @@ impl ExcelReader {
         })
     }
 
-    fn build_dtypes(raw_dtypes: Option<&PyDict>) -> FastExcelResult<Option<DTypeMap>> {
-        match raw_dtypes {
-            None => Ok(None),
-            Some(py_dict) => py_dict.try_into().map(Some),
-        }
-        .with_context(|| "could not parse provided dtypes")
-    }
-
     fn build_selected_columns(use_columns: Option<&PyAny>) -> FastExcelResult<SelectedColumns> {
         use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}"))
     }
@@ -84,14 +76,13 @@ impl ExcelReader {
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
         use_columns: Option<&PyAny>,
-        dtypes: Option<&PyDict>,
+        dtypes: Option<DTypeMap>,
     ) -> FastExcelResult<ExcelSheet> {
         let range = self.sheets.worksheet_range(&name)?;
 
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range)?;
         let selected_columns = Self::build_selected_columns(use_columns)?;
-        let dtypes = Self::build_dtypes(dtypes)?;
         ExcelSheet::try_new(
             name,
             range,
@@ -148,7 +139,7 @@ impl ExcelReader {
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
         use_columns: Option<&PyAny>,
-        dtypes: Option<&PyDict>,
+        dtypes: Option<DTypeMap>,
     ) -> PyResult<ExcelSheet> {
         let name = idx_or_name
             .try_into()
diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs
new file mode 100644
index 0000000..d26882e
--- /dev/null
+++ b/src/types/python/excelsheet/column_info.rs
@@ -0,0 +1,273 @@
+use std::{str::FromStr, usize};
+
+use calamine::{Data as CalData, Range};
+use pyo3::{pyclass, pymethods, PyResult};
+
+use crate::{
+    error::{
+        py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
+    },
+    types::{
+        dtype::{get_dtype_for_column, DType, DTypeMap},
+        idx_or_name::IdxOrName,
+    },
+};
+
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) enum ColumnNameFrom {
+    Provided,
+    LookedUp,
+    Generated,
+}
+
+impl FromStr for ColumnNameFrom {
+    type Err = FastExcelError;
+
+    fn from_str(s: &str) -> FastExcelResult<Self> {
+        match s {
+            "provided" => Ok(Self::Provided),
+            "looked_up" => Ok(Self::LookedUp),
+            "generated" => Ok(Self::Generated),
+            _ => Err(
+                FastExcelErrorKind::InvalidParameters(format!("invalid ColumnNameFrom: {s}"))
+                    .into(),
+            ),
+        }
+    }
+}
+
+impl ToString for ColumnNameFrom {
+    fn to_string(&self) -> String {
+        match self {
+            ColumnNameFrom::Provided => "provided",
+            ColumnNameFrom::LookedUp => "looked_up",
+            ColumnNameFrom::Generated => "generated",
+        }
+        .to_string()
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) enum DTypeFrom {
+    ProvidedByIndex,
+    ProvidedByName,
+    Guessed,
+}
+
+impl ToString for DTypeFrom {
+    fn to_string(&self) -> String {
+        match self {
+            DTypeFrom::ProvidedByIndex => "provided_by_index",
+            DTypeFrom::ProvidedByName => "provided_by_name",
+            DTypeFrom::Guessed => "guessed",
+        }
+        .to_string()
+    }
+}
+
+impl FromStr for DTypeFrom {
+    type Err = FastExcelError;
+
+    fn from_str(s: &str) -> FastExcelResult<Self> {
+        match s {
+            "provided_by_index" => Ok(Self::ProvidedByIndex),
+            "provided_by_name" => Ok(Self::ProvidedByName),
+            "guessed" => Ok(Self::Guessed),
+            _ => Err(
+                FastExcelErrorKind::InvalidParameters(format!("invalid DTypesFrom: {s}")).into(),
+            ),
+        }
+    }
+}
+
+// NOTE: The types for properties unfortunately do not appear in the docs for this class, so we had
+// to specify them via docstrings
+/// This class provides information about a single column in a sheet
+#[derive(Debug, Clone, PartialEq)]
+#[pyclass(name = "ColumnInfo")]
+pub(crate) struct ColumnInfo {
+    /// `str`. The name of the column
+    #[pyo3(get)]
+    name: String,
+    /// `int`. The index of the column
+    #[pyo3(get)]
+    index: usize,
+    dtype: DType,
+    column_name_from: ColumnNameFrom,
+    dtype_from: DTypeFrom,
+}
+
+impl ColumnInfo {
+    pub(crate) fn new(
+        name: String,
+        index: usize,
+        column_name_from: ColumnNameFrom,
+        dtype: DType,
+        dtype_from: DTypeFrom,
+    ) -> Self {
+        Self {
+            name,
+            index,
+            dtype,
+            column_name_from,
+            dtype_from,
+        }
+    }
+
+    pub(crate) fn name(&self) -> &str {
+        &self.name
+    }
+
+    pub(crate) fn index(&self) -> usize {
+        self.index
+    }
+
+    pub(crate) fn dtype(&self) -> &DType {
+        &self.dtype
+    }
+}
+
+#[pymethods]
+impl ColumnInfo {
+    /// Creates a new ColumnInfo object.
+    ///
+    /// - `name`: `str`. The name of the column
+    /// - `index`: `int`. The index of the column. Must be >=0
+    /// - `column_name_from`: `fastexcel.ColumnNameFrom`. The origin of the column name
+    /// - `dtype`: `fastexcel.DType`. The dtype of the column
+    /// - `dtype_from`: `fastexcel.DTypeFrom`. The origin of the dtype for the column
+    #[new]
+    pub(crate) fn py_new(
+        name: String,
+        index: usize,
+        column_name_from: &str,
+        dtype: &str,
+        dtype_from: &str,
+    ) -> PyResult<Self> {
+        Ok(Self::new(
+            name,
+            index,
+            column_name_from.parse().into_pyresult()?,
+            dtype.parse().into_pyresult()?,
+            dtype_from.parse().into_pyresult()?,
+        ))
+    }
+    /// `fastexcel.DType`. The dtype of the column
+    #[getter(dtype)]
+    fn get_dtype(&self) -> String {
+        self.dtype.to_string()
+    }
+
+    /// `fastexcel.ColumnNameFrom`. How the name of the column was determined.
+    ///
+    /// One of three possible values:
+    /// - `"provided"`: The column name was provided via the `column_names` parameter
+    /// - `"looked_up"`: The column name was looked up from the data found in the sheet
+    /// - `"generated"`: The column name was generated from the column index, either because
+    ///                  `header_row` was `None`, or because it could not be looked up
+    #[getter(column_name_from)]
+    fn get_colum_name_from(&self) -> String {
+        self.column_name_from.to_string()
+    }
+
+    /// `fastexcel.DTypeFrom`. How the dtype of the column was determined.
+    ///
+    /// One of three possible values:
+    /// - `"provided_by_index"`: The dtype was specified via the column index
+    /// - `"provided_by_name"`: The dtype was specified via the column name
+    /// - `"guessed"`: The dtype was determined from the content of the column
+    #[getter(dtype_from)]
+    fn get_dtype_from(&self) -> String {
+        self.dtype_from.to_string()
+    }
+
+    pub fn __repr__(&self) -> String {
+        format!("ColumnInfo<name=\"{name}\", index={index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" >", name=self.name, index=self.index, dtype=self.dtype.to_string(), dtype_from=self.dtype_from.to_string(), column_name_from=self.column_name_from.to_string())
+    }
+
+    pub fn __eq__(&self, other: &Self) -> bool {
+        self == other
+    }
+}
+
+#[derive(Debug)]
+pub(super) struct ColumnInfoBuilder {
+    name: String,
+    index: usize,
+    column_name_from: ColumnNameFrom,
+}
+
+// Allows us to easily compare ourselves to a column index or name
+impl PartialEq<IdxOrName> for ColumnInfoBuilder {
+    fn eq(&self, other: &IdxOrName) -> bool {
+        match other {
+            IdxOrName::Idx(index) => index == &self.index,
+            IdxOrName::Name(name) => name == &self.name,
+        }
+    }
+}
+
+impl ColumnInfoBuilder {
+    pub(super) fn new(name: String, index: usize, column_name_from: ColumnNameFrom) -> Self {
+        Self {
+            name,
+            index,
+            column_name_from,
+        }
+    }
+
+    pub(super) fn with_name(mut self, name: String) -> Self {
+        self.name = name;
+        self
+    }
+
+    pub(super) fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn dtype_info(
+        &self,
+        data: &Range<CalData>,
+        start_row: usize,
+        end_row: usize,
+        specified_dtypes: Option<&DTypeMap>,
+    ) -> FastExcelResult<(DType, DTypeFrom)> {
+        specified_dtypes
+            .and_then(|dtypes| {
+                // if we have dtypes, look the dtype up by index, and fall back on a lookup by name
+                // (done in this order because copying a usize is cheaper than cloning a string)
+                if let Some(dtype) = dtypes.get(&self.index.into()) {
+                    Some((*dtype, DTypeFrom::ProvidedByIndex))
+                } else {
+                    dtypes
+                        .get(&self.name.clone().into())
+                        .map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
+                }
+            })
+            .map(FastExcelResult::Ok)
+            // If we could not look up a dtype, guess it from the data
+            .unwrap_or_else(|| {
+                get_dtype_for_column(data, start_row, end_row, self.index)
+                    .map(|dtype| (dtype, DTypeFrom::Guessed))
+            })
+    }
+
+    pub(super) fn finish(
+        self,
+        data: &Range<CalData>,
+        start_row: usize,
+        end_row: usize,
+        specified_dtypes: Option<&DTypeMap>,
+    ) -> FastExcelResult<ColumnInfo> {
+        let (dtype, dtype_from) = self
+            .dtype_info(data, start_row, end_row, specified_dtypes)
+            .with_context(|| format!("could not determine dtype for column {}", self.name))?;
+        Ok(ColumnInfo::new(
+            self.name,
+            self.index,
+            self.column_name_from,
+            dtype,
+            dtype_from,
+        ))
+    }
+}
diff --git a/src/types/python/excelsheet.rs b/src/types/python/excelsheet/mod.rs
similarity index 70%
rename from src/types/python/excelsheet.rs
rename to src/types/python/excelsheet/mod.rs
index e0b5fde..616a3ce 100644
--- a/src/types/python/excelsheet.rs
+++ b/src/types/python/excelsheet/mod.rs
@@ -1,11 +1,15 @@
+pub(crate) mod column_info;
+
 use std::{cmp, collections::HashSet, str::FromStr, sync::Arc};
 
 use crate::{
     error::{
         py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
     },
-    types::{dtype::DTypeMap, idx_or_name::IdxOrName},
-    utils::arrow::alias_for_name,
+    types::{
+        dtype::{DType, DTypeMap},
+        idx_or_name::IdxOrName,
+    },
 };
 
 use arrow::{
@@ -13,7 +17,7 @@ use arrow::{
         Array, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array, Int64Array,
         NullArray, StringArray, TimestampMillisecondArray,
     },
-    datatypes::{DataType as ArrowDataType, Schema, TimeUnit},
+    datatypes::{Field, Schema},
     pyarrow::ToPyArrow,
     record_batch::RecordBatch,
 };
@@ -26,7 +30,9 @@ use pyo3::{
     PyAny, PyResult, ToPyObject,
 };
 
-use crate::utils::arrow::arrow_schema_from_column_names_and_range;
+// use crate::utils::arrow::arrow_schema_from_column_names_and_range;
+
+use self::column_info::{ColumnInfo, ColumnInfoBuilder, ColumnNameFrom};
 
 #[derive(Debug)]
 pub(crate) enum Header {
@@ -81,77 +87,6 @@ impl Pagination {
         self.skip_rows
     }
 }
-
-#[derive(Debug, PartialEq)]
-pub(crate) enum SelectedColumns {
-    All,
-    ByIndex(Vec<usize>),
-    ByName(Vec<String>),
-}
-
-impl SelectedColumns {
-    pub(crate) fn validate_columns(&self, column_names: &[String]) -> FastExcelResult<()> {
-        match self {
-            SelectedColumns::All => Ok(()),
-            // If no selected indice is >= to the len of column_names, we're good
-            SelectedColumns::ByIndex(indices) => indices.iter().try_for_each(|idx| {
-                if idx >= &column_names.len() {
-                    Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Idx(*idx)).into())
-                } else {
-                    Ok(())
-                }
-            }),
-            // Every selected column must be in the provided column_names
-            SelectedColumns::ByName(selected_names) => {
-                selected_names.iter().try_for_each(|selected_name| {
-                    if column_names.contains(selected_name) {
-                        Ok(())
-                    } else {
-                        Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Name(
-                            selected_name.to_string(),
-                        ))
-                        .into())
-                    }
-                })
-            }
-        }
-    }
-
-    pub(crate) fn idx_for_column(
-        &self,
-        col_names: &[String],
-        col_name: &str,
-        col_idx: usize,
-    ) -> Option<usize> {
-        match self {
-            SelectedColumns::All => None,
-            SelectedColumns::ByIndex(indices) => {
-                if indices.contains(&col_idx) {
-                    Some(col_idx)
-                } else {
-                    None
-                }
-            }
-            SelectedColumns::ByName(names) => {
-                // cannot use .contains() because we have &String and &str
-                if names.iter().any(|name| name == col_name) {
-                    col_names.iter().position(|name| name == col_name)
-                } else {
-                    None
-                }
-            }
-        }
-    }
-
-    pub(crate) fn to_python<'p>(&self, py: Python<'p>) -> Option<&'p PyList> {
-        match self {
-            SelectedColumns::All => None,
-            SelectedColumns::ByIndex(idx_vec) => Some(PyList::new(py, idx_vec)),
-            SelectedColumns::ByName(name_vec) => Some(PyList::new(py, name_vec)),
-        }
-    }
-}
-
 impl TryFrom<&PyList> for SelectedColumns {
     type Error = FastExcelError;
 
@@ -160,10 +95,8 @@ impl TryFrom<&PyList> for SelectedColumns {
 
         if py_list.is_empty() {
             Err(InvalidParameters("list of selected columns is empty".to_string()).into())
-        } else if let Ok(name_vec) = py_list.extract::<Vec<String>>() {
-            Ok(Self::ByName(name_vec))
-        } else if let Ok(index_vec) = py_list.extract::<Vec<usize>>() {
-            Ok(Self::ByIndex(index_vec))
+        } else if let Ok(selection) = py_list.extract::<Vec<IdxOrName>>() {
+            Ok(Self::Selection(selection))
         } else {
             Err(
                 InvalidParameters(format!("expected list[int] | list[str], got {py_list:?}"))
@@ -173,7 +106,39 @@ impl TryFrom<&PyList> for SelectedColumns {
     }
 }
 
+#[derive(Debug, PartialEq)]
+pub(crate) enum SelectedColumns {
+    All,
+    Selection(Vec<IdxOrName>),
+}
+
 impl SelectedColumns {
+    pub(super) fn select_columns(
+        &self,
+        column_info: &[ColumnInfo],
+    ) -> FastExcelResult<Vec<ColumnInfo>> {
+        match self {
+            SelectedColumns::All => Ok(column_info.to_vec()),
+            SelectedColumns::Selection(selection) => selection
+                .iter()
+                .map(|selected_column| {
+                    match selected_column {
+                        IdxOrName::Idx(index) => column_info
+                            .iter()
+                            .find(|col_info| &col_info.index() == index),
+                        IdxOrName::Name(name) => column_info
+                            .iter()
+                            .find(|col_info| col_info.name() == name.as_str()),
+                    }
+                    .ok_or_else(|| {
+                        FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into()
+                    })
+                    .map(Clone::clone)
+                    .with_context(|| format!("available columns are: {column_info:?}"))
+                })
+                .collect(),
+        }
+    }
     const ALPHABET: [char; 26] = [
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
         'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
@@ -272,7 +237,9 @@ impl FromStr for SelectedColumns {
             .collect();
         let mut sorted_col_indices: Vec<usize> = unique_col_indices.into_iter().collect();
         sorted_col_indices.sort();
-        Ok(Self::ByIndex(sorted_col_indices))
+        Ok(Self::Selection(
+            sorted_col_indices.into_iter().map(IdxOrName::Idx).collect(),
+        ))
     }
 }
 
@@ -311,6 +278,25 @@ impl TryFrom<Option<&PyAny>> for SelectedColumns {
     }
 }
 
+fn alias_for_name(name: &str, existing_names: &[String]) -> String {
+    fn rec(name: &str, existing_names: &[String], depth: usize) -> String {
+        let alias = if depth == 0 {
+            name.to_owned()
+        } else {
+            format!("{name}_{depth}")
+        };
+        match existing_names
+            .iter()
+            .any(|existing_name| existing_name == &alias)
+        {
+            true => rec(name, existing_names, depth + 1),
+            false => alias,
+        }
+    }
+
+    rec(name, existing_names, 0)
+}
+
 #[pyclass(name = "_ExcelSheet")]
 pub(crate) struct ExcelSheet {
     #[pyo3(get)]
@@ -322,8 +308,9 @@ pub(crate) struct ExcelSheet {
     total_height: Option<usize>,
     width: Option<usize>,
     schema_sample_rows: Option<usize>,
-    selected_columns: SelectedColumns,
-    available_columns: Vec<String>,
+    // selected_columns: SelectedColumns,
+    selected_columns: Vec<ColumnInfo>,
+    available_columns: Vec<ColumnInfo>,
     dtypes: Option<DTypeMap>,
 }
 
@@ -342,12 +329,7 @@ impl ExcelSheet {
         dtypes: Option<DTypeMap>,
     ) -> FastExcelResult<Self> {
         // Ensuring dtypes are compatible with selected columns
-        match (&dtypes, &selected_columns) {
-            (None, _) | (_, SelectedColumns::All) => Ok::<(), FastExcelError>(()),
-            (Some(DTypeMap::ByIndex(_)), SelectedColumns::ByIndex(_)) => Ok(()),
-            (Some(DTypeMap::ByName(_)), SelectedColumns::ByName(_)) => Ok(()),
-            (Some(other), selected_columns) => Err(FastExcelErrorKind::InvalidParameters(format!("invalid dtypes and selected column combiantion, got \"{other:?}\" and \"{selected_columns:?}\"")).into())
-        }?;
+        // Self::validate_dtypes_and_selected_columns(&selected_columns, &dtypes)?;
 
         let mut sheet = ExcelSheet {
             name,
@@ -355,59 +337,94 @@ impl ExcelSheet {
             pagination,
             data,
             schema_sample_rows,
-            selected_columns,
             dtypes,
             height: None,
             total_height: None,
             width: None,
-            // an empty vec as it will be replaced
+            // Empty vecs as they'll be replaced
             available_columns: Vec::with_capacity(0),
+            selected_columns: Vec::with_capacity(0),
         };
 
-        let available_columns = sheet.get_available_columns();
+        let available_columns_info = sheet.get_available_columns_info();
 
-        let mut aliased_available_columns = Vec::with_capacity(available_columns.len());
+        let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len());
 
-        available_columns.iter().for_each(|column_name| {
-            aliased_available_columns.push(alias_for_name(column_name, &aliased_available_columns))
-        });
+        let dtype_sample_rows =
+            sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit());
+        let row_limit = cmp::min(dtype_sample_rows, sheet.limit());
 
-        // Ensuring selected columns are valid
-        sheet
-            .selected_columns
-            .validate_columns(&aliased_available_columns)
-            .with_context(|| {
-                format!(
-                    "selected columns are invalid, available columns are: {available_columns:?}"
+        // Finalizing column info
+        let available_columns = available_columns_info
+            .into_iter()
+            .map(|mut column_info_builder| {
+                // Setting the right alias for every column
+                let alias = alias_for_name(column_info_builder.name(), &aliased_available_columns);
+                if alias != column_info_builder.name() {
+                    column_info_builder = column_info_builder.with_name(alias.clone());
+                }
+                aliased_available_columns.push(alias);
+                // Setting the dtype info
+                column_info_builder.finish(
+                    &sheet.data,
+                    sheet.offset(),
+                    row_limit,
+                    sheet.dtypes.as_ref(),
                 )
-            })?;
+            })
+            .collect::<FastExcelResult<Vec<_>>>()?;
+        let selected_columns = selected_columns.select_columns(&available_columns)?;
+        sheet.available_columns = available_columns;
+        sheet.selected_columns = selected_columns;
 
-        sheet.available_columns = aliased_available_columns;
+        // Names, indices and dtypes are now resolved for every available and selected column
         Ok(sheet)
     }
 
-    fn get_available_columns(&self) -> Vec<String> {
+    fn get_available_columns_info(&self) -> Vec<ColumnInfoBuilder> {
         let width = self.data.width();
         match &self.header {
             Header::None => (0..width)
-                .map(|col_idx| format!("__UNNAMED__{col_idx}"))
+                .map(|col_idx| {
+                    ColumnInfoBuilder::new(
+                        format!("__UNNAMED__{col_idx}"),
+                        col_idx,
+                        ColumnNameFrom::Generated,
+                    )
+                })
                 .collect(),
             Header::At(row_idx) => (0..width)
                 .map(|col_idx| {
                     self.data
                         .get((*row_idx, col_idx))
                         .and_then(|data| data.as_string())
-                        .unwrap_or(format!("__UNNAMED__{col_idx}"))
+                        .map(|col_name| {
+                            ColumnInfoBuilder::new(col_name, col_idx, ColumnNameFrom::LookedUp)
+                        })
+                        .unwrap_or_else(|| {
+                            ColumnInfoBuilder::new(
+                                format!("__UNNAMED__{col_idx}"),
+                                col_idx,
+                                ColumnNameFrom::Generated,
+                            )
+                        })
                 })
                 .collect(),
             Header::With(names) => {
                 let nameless_start_idx = names.len();
                 names
                     .iter()
-                    .map(ToOwned::to_owned)
-                    .chain(
-                        (nameless_start_idx..width).map(|col_idx| format!("__UNNAMED__{col_idx}")),
-                    )
+                    .enumerate()
+                    .map(|(col_idx, name)| {
+                        ColumnInfoBuilder::new(name.to_owned(), col_idx, ColumnNameFrom::Provided)
+                    })
+                    .chain((nameless_start_idx..width).map(|col_idx| {
+                        ColumnInfoBuilder::new(
+                            format!("__UNNAMED__{col_idx}"),
+                            col_idx,
+                            ColumnNameFrom::Generated,
+                        )
+                    }))
                     .collect()
             }
         }
@@ -533,23 +550,14 @@ fn create_duration_array(
     ))
 }
 
-impl TryFrom<&ExcelSheet> for Schema {
-    type Error = FastExcelError;
-
-    fn try_from(sheet: &ExcelSheet) -> Result<Self, Self::Error> {
-        // Checking how many rows we want to use to determine the dtype for a column. If sample_rows is
-        // not provided, we sample limit rows, i.e on the entire column
-        let sample_rows = sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit());
-
-        arrow_schema_from_column_names_and_range(
-            sheet.data(),
-            &sheet.available_columns,
-            sheet.offset(),
-            // If sample_rows is higher than the sheet's limit, use the limit instead
-            cmp::min(sample_rows, sheet.limit()),
-            &sheet.selected_columns,
-            sheet.dtypes.as_ref(),
-        )
+impl From<&ExcelSheet> for Schema {
+    fn from(sheet: &ExcelSheet) -> Self {
+        let fields: Vec<_> = sheet
+            .selected_columns
+            .iter()
+            .map(|col_info| Field::new(col_info.name(), col_info.dtype().into(), true))
+            .collect();
+        Schema::new(fields)
     }
 }
 
@@ -560,65 +568,45 @@ impl TryFrom<&ExcelSheet> for RecordBatch {
         let offset = sheet.offset();
         let limit = sheet.limit();
 
-        let schema = Schema::try_from(sheet)
-            .with_context(|| format!("could not build schema for sheet {}", sheet.name))?;
-
         let mut iter = sheet
-            .available_columns
+            .selected_columns
             .iter()
-            .enumerate()
-            .filter_map(|(idx, column_name)| {
-                // checking if the current column has been selected
-                if let Some(col_idx) = match sheet.selected_columns {
-                    // All columns selected, return the current index
-                    SelectedColumns::All => Some(idx),
-                    // Otherwise, return its index. If None is found, it means the column was not
-                    // selected, and we will just continue
-                    _ => sheet.selected_columns.idx_for_column(
-                        &sheet.available_columns,
-                        column_name,
-                        idx,
-                    ),
-                } {
-                    // At this point, we know for sure that the column is in the schema so we can
-                    // safely unwrap
-                    let field = schema.field_with_name(column_name).unwrap();
-                    Some((
-                        field.name(),
-                        match field.data_type() {
-                            ArrowDataType::Boolean => {
-                                create_boolean_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Int64 => {
-                                create_int_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Float64 => {
-                                create_float_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Utf8 => {
-                                create_string_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => {
-                                create_datetime_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Date32 => {
-                                create_date_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Duration(TimeUnit::Millisecond) => {
-                                create_duration_array(sheet.data(), col_idx, offset, limit)
-                            }
-                            ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)),
-                            _ => unreachable!(),
-                        },
-                    ))
-                } else {
-                    None
-                }
+            .map(|column_info| {
+                // Each selected column already carries its own name, dtype and index in the
+                // sheet, so the matching arrow array can be built without any schema lookup
+                (
+                    column_info.name(),
+                    match column_info.dtype() {
+                        DType::Bool => {
+                            create_boolean_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::Int => {
+                            create_int_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::Float => {
+                            create_float_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::String => {
+                            create_string_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::DateTime => {
+                            create_datetime_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::Date => {
+                            create_date_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::Duration => {
+                            create_duration_array(sheet.data(), column_info.index(), offset, limit)
+                        }
+                        DType::Null => Arc::new(NullArray::new(limit - offset)),
+                    },
+                )
             })
             .peekable();
 
         // If the iterable is empty, try_from_iter returns an Err
         if iter.peek().is_none() {
+            let schema: Schema = sheet.into();
             Ok(RecordBatch::new_empty(Arc::new(schema)))
         } else {
             RecordBatch::try_from_iter(iter)
@@ -663,13 +651,13 @@ impl ExcelSheet {
     }
 
     #[getter]
-    pub fn selected_columns<'p>(&'p self, py: Python<'p>) -> Option<&PyList> {
-        self.selected_columns.to_python(py)
+    pub fn selected_columns<'p>(&'p self, _py: Python<'p>) -> Vec<ColumnInfo> {
+        self.selected_columns.clone()
     }
 
     #[getter]
-    pub fn available_columns<'p>(&'p self, py: Python<'p>) -> &PyList {
-        PyList::new(py, &self.available_columns)
+    pub fn available_columns<'p>(&'p self, _py: Python<'p>) -> Vec<ColumnInfo> {
+        self.available_columns.clone()
     }
 
     #[getter]
@@ -718,7 +706,7 @@ mod tests {
             let py_list = PyList::new(py, vec![0, 1, 2]).as_ref();
             assert_eq!(
                 TryInto::<SelectedColumns>::try_into(Some(py_list)).unwrap(),
-                SelectedColumns::ByIndex(vec![0, 1, 2])
+                SelectedColumns::Selection([0, 1, 2].into_iter().map(IdxOrName::Idx).collect())
             )
         });
     }
@@ -729,7 +717,31 @@ mod tests {
             let py_list = PyList::new(py, vec!["foo", "bar"]).as_ref();
             assert_eq!(
                 TryInto::<SelectedColumns>::try_into(Some(py_list)).unwrap(),
-                SelectedColumns::ByName(vec!["foo".to_string(), "bar".to_string()])
+                SelectedColumns::Selection(
+                    ["foo", "bar"]
+                        .iter()
+                        .map(ToString::to_string)
+                        .map(IdxOrName::Name)
+                        .collect()
+                )
+            )
+        });
+    }
+
+    #[test]
+    fn selected_columns_from_list_of_valid_strings_and_ints() {
+        Python::with_gil(|py| {
+            let py_list = PyList::new(py, vec!["foo", "bar"]);
+            py_list.append(42).unwrap();
+            py_list.append(5).unwrap();
+            assert_eq!(
+                TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap(),
+                SelectedColumns::Selection(vec![
+                    IdxOrName::Name("foo".to_string()),
+                    IdxOrName::Name("bar".to_string()),
+                    IdxOrName::Idx(42),
+                    IdxOrName::Idx(5)
+                ])
             )
         });
     }
@@ -774,9 +786,11 @@ mod tests {
     // Ranges beyond Z
     #[case("A,y:AB", vec![0, 24, 25, 26, 27])]
     #[case("BB:BE,DDC:DDF", vec![53, 54, 55, 56, 2810, 2811, 2812, 2813])]
-    fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected: Vec<usize>) {
+    fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected_indices: Vec<usize>) {
         Python::with_gil(|py| {
-            let expected_range = SelectedColumns::ByIndex(expected);
+            let expected_range = SelectedColumns::Selection(
+                expected_indices.into_iter().map(IdxOrName::Idx).collect(),
+            );
             let input = PyString::new(py, raw).as_ref();
 
             let range = TryInto::<SelectedColumns>::try_into(Some(input))
diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs
deleted file mode 100644
index 0f7d42b..0000000
--- a/src/utils/arrow.rs
+++ /dev/null
@@ -1,270 +0,0 @@
-use std::{collections::HashSet, sync::OnceLock, usize};
-
-use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
-use calamine::{CellErrorType, Data as CalData, DataType, Range};
-
-use crate::{
-    error::{FastExcelErrorKind, FastExcelResult},
-    types::{dtype::DTypeMap, python::excelsheet::SelectedColumns},
-};
-
-/// All the possible string values that should be considered as NULL
-const NULL_STRING_VALUES: [&str; 19] = [
-    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
-    "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null",
-];
-
-fn get_cell_type(data: &Range<CalData>, row: usize, col: usize) -> FastExcelResult<ArrowDataType> {
-    let cell = data
-        .get((row, col))
-        .ok_or_else(|| FastExcelErrorKind::CannotRetrieveCellData(row, col))?;
-
-    match cell {
-        CalData::Int(_) => Ok(ArrowDataType::Int64),
-        CalData::Float(_) => Ok(ArrowDataType::Float64),
-        CalData::String(v) => match v {
-            v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(ArrowDataType::Null),
-            _ => Ok(ArrowDataType::Utf8),
-        },
-        CalData::Bool(_) => Ok(ArrowDataType::Boolean),
-        // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be
-        // a duration or a datatime
-        CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() {
-            ArrowDataType::Timestamp(TimeUnit::Millisecond, None)
-        } else {
-            ArrowDataType::Duration(TimeUnit::Millisecond)
-        }),
-        // These types contain an ISO8601 representation of a date/datetime or a duration
-        CalData::DateTimeIso(_) => match cell.as_datetime() {
-            // If we cannot convert the cell to a datetime, we're working on a date
-            Some(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)),
-            // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime
-            // rather than a date
-            None => Ok(ArrowDataType::Date32),
-        },
-        // A simple duration
-        CalData::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)),
-        // Errors and nulls
-        CalData::Error(err) => match err {
-            CellErrorType::NA => Ok(ArrowDataType::Null),
-            _ => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()),
-        },
-        CalData::Empty => Ok(ArrowDataType::Null),
-    }
-}
-
-static FLOAT_TYPES_CELL: OnceLock<HashSet<ArrowDataType>> = OnceLock::new();
-static INT_TYPES_CELL: OnceLock<HashSet<ArrowDataType>> = OnceLock::new();
-static STRING_TYPES_CELL: OnceLock<HashSet<ArrowDataType>> = OnceLock::new();
-
-fn float_types() -> &'static HashSet<ArrowDataType> {
-    FLOAT_TYPES_CELL.get_or_init(|| {
-        HashSet::from([
-            ArrowDataType::Int64,
-            ArrowDataType::Float64,
-            ArrowDataType::Boolean,
-        ])
-    })
-}
-
-fn int_types() -> &'static HashSet<ArrowDataType> {
-    INT_TYPES_CELL.get_or_init(|| HashSet::from([ArrowDataType::Int64, ArrowDataType::Boolean]))
-}
-
-fn string_types() -> &'static HashSet<ArrowDataType> {
-    STRING_TYPES_CELL.get_or_init(|| {
-        HashSet::from([
-            ArrowDataType::Int64,
-            ArrowDataType::Float64,
-            ArrowDataType::Utf8,
-        ])
-    })
-}
-
-fn get_arrow_column_type(
-    data: &Range<CalData>,
-    start_row: usize,
-    end_row: usize,
-    col: usize,
-) -> FastExcelResult<ArrowDataType> {
-    let mut column_types = (start_row..end_row)
-        .map(|row| get_cell_type(data, row, col))
-        .collect::<FastExcelResult<HashSet<_>>>()?;
-
-    // All columns are nullable anyway so we're not taking Null into account here
-    column_types.remove(&ArrowDataType::Null);
-
-    if column_types.is_empty() {
-        // If no type apart from NULL was found, it's a NULL column
-        Ok(ArrowDataType::Null)
-    } else if column_types.len() == 1 {
-        // If a single non-null type was found, return it
-        Ok(column_types.into_iter().next().unwrap())
-    } else if column_types.is_subset(int_types()) {
-        // If every cell in the column can be converted to an int, return int64
-        Ok(ArrowDataType::Int64)
-    } else if column_types.is_subset(float_types()) {
-        // If every cell in the column can be converted to a float, return Float64
-        Ok(ArrowDataType::Float64)
-    } else if column_types.is_subset(string_types()) {
-        // If every cell in the column can be converted to a string, return Utf8
-        Ok(ArrowDataType::Utf8)
-    } else {
-        // NOTE: Not being too smart about multi-types columns for now
-        Err(
-            FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}"))
-                .into(),
-        )
-    }
-}
-
-pub(crate) fn alias_for_name(name: &str, existing_names: &[String]) -> String {
-    fn rec(name: &str, existing_names: &[String], depth: usize) -> String {
-        let alias = if depth == 0 {
-            name.to_owned()
-        } else {
-            format!("{name}_{depth}")
-        };
-        match existing_names
-            .iter()
-            .any(|existing_name| existing_name == &alias)
-        {
-            true => rec(name, existing_names, depth + 1),
-            false => alias,
-        }
-    }
-
-    rec(name, existing_names, 0)
-}
-
-pub(crate) fn arrow_schema_from_column_names_and_range(
-    range: &Range<CalData>,
-    column_names: &[String],
-    row_idx: usize,
-    row_limit: usize,
-    selected_columns: &SelectedColumns,
-    dtypes: Option<&DTypeMap>,
-) -> FastExcelResult<Schema> {
-    // clippy suggests to split this type annotation into type declaration, but that would make it
-    // less clear IMO
-    #[allow(clippy::type_complexity)]
-    let arrow_type_for_column: Box<dyn Fn(usize, &String) -> FastExcelResult<ArrowDataType>> =
-        match selected_columns {
-            // In case all columns are selected, we look up the dtype for the column by name,
-            // fallback on a lookup by index, and finally on get_arrow_column_type
-            SelectedColumns::All => Box::new(|col_idx, col_name| match dtypes {
-                None => get_arrow_column_type(range, row_idx, row_limit, col_idx),
-                Some(dts) => {
-                    if let Some(dtype_by_name) = dts.dtype_for_col_name(col_name) {
-                        Ok(dtype_by_name.into())
-                    } else if let Some(dtype_by_idx) = dts.dtype_for_col_idx(col_idx) {
-                        Ok(dtype_by_idx.into())
-                    } else {
-                        get_arrow_column_type(range, row_idx, row_limit, col_idx)
-                    }
-                }
-            }),
-            // If columns are selected by name, look up the dtype by name and fallback on
-            // get_arrow_column_type
-            SelectedColumns::ByName(_) => Box::new(|col_idx, col_name| {
-                dtypes
-                    .and_then(|dtypes| dtypes.dtype_for_col_name(col_name))
-                    .map(|dtype| Ok(dtype.into()))
-                    .unwrap_or_else(|| get_arrow_column_type(range, row_idx, row_limit, col_idx))
-            }),
-
-            // If columns are selected by index, look up the dtype by name and fallback on
-            // get_arrow_column_type
-            SelectedColumns::ByIndex(_) => Box::new(|col_idx, _col_name| {
-                dtypes
-                    .and_then(|dtypes| dtypes.dtype_for_col_idx(col_idx))
-                    .map(|dtype| Ok(dtype.into()))
-                    .unwrap_or_else(|| get_arrow_column_type(range, row_idx, row_limit, col_idx))
-            }),
-        };
-
-    let mut fields = Vec::with_capacity(column_names.len());
-    let mut existing_names = Vec::with_capacity(column_names.len());
-
-    for (idx, name) in column_names.iter().enumerate() {
-        // If we have an index for the given column, extract it and add it to the schema. Otherwise,
-        // just ignore it
-        if let Some(col_idx) = match selected_columns {
-            SelectedColumns::All => Some(idx),
-            _ => selected_columns.idx_for_column(column_names, name, idx),
-        } {
-            let col_type = arrow_type_for_column(col_idx, name)?;
-            let aliased_name = alias_for_name(name, &existing_names);
-            fields.push(Field::new(&aliased_name, col_type, true));
-            existing_names.push(aliased_name);
-        }
-    }
-
-    Ok(Schema::new(fields))
-}
-
-#[cfg(test)]
-mod tests {
-    use calamine::Cell;
-    use rstest::{fixture, rstest};
-
-    use super::*;
-
-    #[fixture]
-    fn range() -> Range<CalData> {
-        Range::from_sparse(vec![
-            // First column
-            Cell::new((0, 0), CalData::Bool(true)),
-            Cell::new((1, 0), CalData::Bool(false)),
-            Cell::new((2, 0), CalData::String("NULL".to_string())),
-            Cell::new((3, 0), CalData::Int(42)),
-            Cell::new((4, 0), CalData::Float(13.37)),
-            Cell::new((5, 0), CalData::String("hello".to_string())),
-            Cell::new((6, 0), CalData::Empty),
-            Cell::new((7, 0), CalData::String("#N/A".to_string())),
-            Cell::new((8, 0), CalData::Int(12)),
-            Cell::new((9, 0), CalData::Float(12.21)),
-            Cell::new((10, 0), CalData::Bool(true)),
-            Cell::new((11, 0), CalData::Int(1337)),
-        ])
-    }
-
-    #[rstest]
-    // pure bool
-    #[case(0, 2, ArrowDataType::Boolean)]
-    // pure int
-    #[case(3, 4, ArrowDataType::Int64)]
-    // pure float
-    #[case(4, 5, ArrowDataType::Float64)]
-    // pure string
-    #[case(5, 6, ArrowDataType::Utf8)]
-    // pure int + float
-    #[case(3, 5, ArrowDataType::Float64)]
-    // null + int + float
-    #[case(2, 5, ArrowDataType::Float64)]
-    // float + string
-    #[case(4, 6, ArrowDataType::Utf8)]
-    // int + float + string
-    #[case(3, 6, ArrowDataType::Utf8)]
-    // null + int + float + string + empty + null
-    #[case(2, 8, ArrowDataType::Utf8)]
-    // empty + null + int
-    #[case(6, 9, ArrowDataType::Int64)]
-    // int + float + null
-    #[case(7, 10, ArrowDataType::Float64)]
-    // int + float + bool + null
-    #[case(7, 11, ArrowDataType::Float64)]
-    // int + bool
-    #[case(10, 12, ArrowDataType::Int64)]
-    fn get_arrow_column_type_multi_dtype_ok(
-        range: Range<CalData>,
-        #[case] start_row: usize,
-        #[case] end_row: usize,
-        #[case] expected: ArrowDataType,
-    ) {
-        assert_eq!(
-            get_arrow_column_type(&range, start_row, end_row, 0).unwrap(),
-            expected
-        );
-    }
-}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
deleted file mode 100644
index 0e9f34f..0000000
--- a/src/utils/mod.rs
+++ /dev/null
@@ -1 +0,0 @@
-pub(crate) mod arrow;