From 7a145c5cf30a63c425913232c914c9869c5b146f Mon Sep 17 00:00:00 2001 From: jprandelli Date: Thu, 13 Nov 2025 22:25:58 +0100 Subject: [PATCH 1/5] Refactor ParquetReader initialization to improve error handling for file checks --- src/datanomy/reader/parquet.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/datanomy/reader/parquet.py b/src/datanomy/reader/parquet.py index c992b94..f5c5de3 100644 --- a/src/datanomy/reader/parquet.py +++ b/src/datanomy/reader/parquet.py @@ -4,6 +4,7 @@ from typing import Any import pyarrow.parquet as pq +from pyarrow.lib import ArrowInvalid class ParquetReader: @@ -16,9 +17,22 @@ def __init__(self, file_path: Path) -> None: Parameters ---------- file_path: Path to the Parquet file - """ - self.file_path = file_path - self.parquet_file = pq.ParquetFile(file_path) + + Raises + ------ + FileNotFoundError: If the file does not exist + ArrowInvalid: If the file is not a valid Parquet file + """ + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + try: + self.file_path = file_path + self.parquet_file = pq.ParquetFile(file_path) + except ArrowInvalid as e: + raise ArrowInvalid( + f"{file_path} does not appear to be a Parquet file" + ) from e @property def schema_arrow(self) -> Any: From 0caae4ec4ece463ec6e5182f3b9c355f83d0ed70 Mon Sep 17 00:00:00 2001 From: jprandelli Date: Thu, 13 Nov 2025 22:26:06 +0100 Subject: [PATCH 2/5] Remove Parquet file extension check from CLI --- src/datanomy/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/datanomy/cli.py b/src/datanomy/cli.py index 8c99de4..04a42af 100644 --- a/src/datanomy/cli.py +++ b/src/datanomy/cli.py @@ -19,10 +19,6 @@ def main(file: Path) -> None: ---------- file: Path to a Parquet file to inspect """ - if not file.suffix.lower() == ".parquet": - click.echo(f"Error: {file} does not appear to be a Parquet file", err=True) - sys.exit(1) - try: reader = ParquetReader(file) app = 
DatanomyApp(reader) From 35f25ce979b72544b4d17dc7f05c5f61374384c5 Mon Sep 17 00:00:00 2001 From: jprandelli Date: Thu, 13 Nov 2025 22:31:51 +0100 Subject: [PATCH 3/5] Add tests for ParquetReader to handle files without .parquet extension, invalid Parquet files and nonexistent files --- tests/conftest.py | 30 ++++++++++++++++++++++++++++++ tests/test_reader.py | 27 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index c902dc3..cc6c8a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -106,3 +106,33 @@ def large_schema_parquet(tmp_path: Path) -> Path: file_path = tmp_path / "large_schema.parquet" pq.write_table(table, file_path) return file_path + + +@pytest.fixture +def parquet_without_extension(tmp_path: Path) -> Path: + """Create a valid Parquet file without .parquet extension. + + Returns: + Path to the created Parquet file + """ + table = pa.table( + { + "id": [1, 2, 3], + "name": ["a", "b", "c"], + } + ) + file_path = tmp_path / "data_file" + pq.write_table(table, file_path) + return file_path + + +@pytest.fixture +def invalid_parquet_file(tmp_path: Path) -> Path: + """Create a file with invalid Parquet content. 
+ + Returns: + Path to the created invalid file + """ + file_path = tmp_path / "not_a_parquet.dat" + file_path.write_text("This is not a Parquet file") + return file_path diff --git a/tests/test_reader.py b/tests/test_reader.py index ef181fe..b0533c1 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -2,6 +2,9 @@ from pathlib import Path +import pytest +from pyarrow.lib import ArrowInvalid + from datanomy.reader.parquet import ParquetReader @@ -59,3 +62,27 @@ def test_reader_large_schema(large_schema_parquet: Path) -> None: field_names = [field.name for field in reader.schema_arrow] for i in range(50): assert f"col_{i}" in field_names + + +def test_reader_nonexistent_file(tmp_path: Path) -> None: + """Test that ParquetReader raises FileNotFoundError for nonexistent files.""" + nonexistent = tmp_path / "nonexistent.parquet" + + with pytest.raises(FileNotFoundError, match="File not found"): + ParquetReader(nonexistent) + + +def test_reader_invalid_parquet_file(invalid_parquet_file: Path) -> None: + """Test that ParquetReader raises ArrowInvalid for non-Parquet files.""" + with pytest.raises(ArrowInvalid, match="does not appear to be a Parquet file"): + ParquetReader(invalid_parquet_file) + + +def test_reader_accepts_file_without_parquet_extension( + parquet_without_extension: Path, +) -> None: + """Test that ParquetReader accepts valid Parquet files regardless of extension.""" + # Should successfully read the file + reader = ParquetReader(parquet_without_extension) + assert reader.num_rows == 3 + assert len(reader.schema_arrow) == 2 From f2541111f93383c262632c056a07b055fa34457d Mon Sep 17 00:00:00 2001 From: jprandelli Date: Thu, 13 Nov 2025 22:32:06 +0100 Subject: [PATCH 4/5] Remove tests for Parquet extension checks from CLI --- tests/test_cli.py | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index d50dfe0..cc18fa4 100644 --- a/tests/test_cli.py +++ 
b/tests/test_cli.py @@ -8,18 +8,6 @@ from datanomy.cli import main -def test_cli_rejects_non_parquet_extension(tmp_path: Path) -> None: - """Test that CLI rejects files without .parquet extension.""" - bad_file = tmp_path / "test.txt" - bad_file.write_text("not parquet") - - runner = CliRunner() - result = runner.invoke(main, [str(bad_file)]) - - assert result.exit_code == 1 - assert "does not appear to be a Parquet file" in result.output - - def test_cli_rejects_nonexistent_file() -> None: """Test that CLI rejects files that don't exist.""" runner = CliRunner() @@ -60,19 +48,13 @@ def test_cli_creates_reader( mock_reader.assert_called_once_with(simple_parquet) -def test_cli_case_insensitive_extension(tmp_path: Path) -> None: - """Test that CLI accepts .PARQUET extension (case insensitive).""" - # Create a valid parquet file with uppercase extension - import pyarrow as pa - import pyarrow.parquet as pq - - file_path = tmp_path / "test.PARQUET" - table = pa.table({"id": [1, 2, 3]}) - pq.write_table(table, file_path) - +def test_cli_accepts_parquet_without_extension( + parquet_without_extension: Path, +) -> None: + """Test that CLI accepts valid Parquet files regardless of extension.""" with patch("datanomy.cli.DatanomyApp"): runner = CliRunner() - result = runner.invoke(main, [str(file_path)]) + result = runner.invoke(main, [str(parquet_without_extension)]) - # Should accept uppercase extension + # Should accept file based on content, not extension assert result.exit_code == 0 From 69fc000967991fd1eefeafdcfa1a1919354a9b48 Mon Sep 17 00:00:00 2001 From: jprandelli Date: Thu, 13 Nov 2025 22:49:32 +0100 Subject: [PATCH 5/5] Fix formatting issues in ParquetReader and test for nonexistent files --- src/datanomy/reader/parquet.py | 4 ++-- tests/test_reader.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datanomy/reader/parquet.py b/src/datanomy/reader/parquet.py index f5c5de3..6b99c55 100644 --- a/src/datanomy/reader/parquet.py +++ 
b/src/datanomy/reader/parquet.py @@ -17,7 +17,7 @@ def __init__(self, file_path: Path) -> None: Parameters ---------- file_path: Path to the Parquet file - + Raises ------ FileNotFoundError: If the file does not exist @@ -25,7 +25,7 @@ def __init__(self, file_path: Path) -> None: """ if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") - + try: self.file_path = file_path self.parquet_file = pq.ParquetFile(file_path) diff --git a/tests/test_reader.py b/tests/test_reader.py index b0533c1..88a3db4 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -67,7 +67,7 @@ def test_reader_large_schema(large_schema_parquet: Path) -> None: def test_reader_nonexistent_file(tmp_path: Path) -> None: """Test that ParquetReader raises FileNotFoundError for nonexistent files.""" nonexistent = tmp_path / "nonexistent.parquet" - + with pytest.raises(FileNotFoundError, match="File not found"): ParquetReader(nonexistent)