diff --git a/src/datanomy/cli.py b/src/datanomy/cli.py index 8c99de4..04a42af 100644 --- a/src/datanomy/cli.py +++ b/src/datanomy/cli.py @@ -19,10 +19,6 @@ def main(file: Path) -> None: ---------- file: Path to a Parquet file to inspect """ - if not file.suffix.lower() == ".parquet": - click.echo(f"Error: {file} does not appear to be a Parquet file", err=True) - sys.exit(1) - try: reader = ParquetReader(file) app = DatanomyApp(reader) diff --git a/src/datanomy/reader/parquet.py b/src/datanomy/reader/parquet.py index c992b94..6b99c55 100644 --- a/src/datanomy/reader/parquet.py +++ b/src/datanomy/reader/parquet.py @@ -4,6 +4,7 @@ from typing import Any import pyarrow.parquet as pq +from pyarrow.lib import ArrowInvalid class ParquetReader: @@ -16,9 +17,22 @@ def __init__(self, file_path: Path) -> None: Parameters ---------- file_path: Path to the Parquet file - """ - self.file_path = file_path - self.parquet_file = pq.ParquetFile(file_path) + + Raises + ------ + FileNotFoundError: If the file does not exist + ArrowInvalid: If the file is not a valid Parquet file + """ + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + try: + self.file_path = file_path + self.parquet_file = pq.ParquetFile(file_path) + except ArrowInvalid as e: + raise ArrowInvalid( + f"{file_path} does not appear to be a Parquet file" + ) from e @property def schema_arrow(self) -> Any: diff --git a/tests/conftest.py b/tests/conftest.py index c902dc3..cc6c8a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -106,3 +106,33 @@ def large_schema_parquet(tmp_path: Path) -> Path: file_path = tmp_path / "large_schema.parquet" pq.write_table(table, file_path) return file_path + + +@pytest.fixture +def parquet_without_extension(tmp_path: Path) -> Path: + """Create a valid Parquet file without .parquet extension. + + Returns: + Path to the created Parquet file + """ + table = pa.table( + { + "id": [1, 2, 3], + "name": ["a", "b", "c"], + } + ) + file_path = tmp_path / "data_file" + pq.write_table(table, file_path) + return file_path + + +@pytest.fixture +def invalid_parquet_file(tmp_path: Path) -> Path: + """Create a file with invalid Parquet content. + + Returns: + Path to the created invalid file + """ + file_path = tmp_path / "not_a_parquet.dat" + file_path.write_text("This is not a Parquet file") + return file_path diff --git a/tests/test_cli.py b/tests/test_cli.py index d50dfe0..cc18fa4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,18 +8,6 @@ from datanomy.cli import main -def test_cli_rejects_non_parquet_extension(tmp_path: Path) -> None: - """Test that CLI rejects files without .parquet extension.""" - bad_file = tmp_path / "test.txt" - bad_file.write_text("not parquet") - - runner = CliRunner() - result = runner.invoke(main, [str(bad_file)]) - - assert result.exit_code == 1 - assert "does not appear to be a Parquet file" in result.output - - def test_cli_rejects_nonexistent_file() -> None: """Test that CLI rejects files that don't exist.""" runner = CliRunner() @@ -60,19 +48,13 @@ def test_cli_creates_reader( mock_reader.assert_called_once_with(simple_parquet) -def test_cli_case_insensitive_extension(tmp_path: Path) -> None: - """Test that CLI accepts .PARQUET extension (case insensitive).""" - # Create a valid parquet file with uppercase extension - import pyarrow as pa - import pyarrow.parquet as pq - - file_path = tmp_path / "test.PARQUET" - table = pa.table({"id": [1, 2, 3]}) - pq.write_table(table, file_path) - +def test_cli_accepts_parquet_without_extension( + parquet_without_extension: Path, +) -> None: + """Test that CLI accepts valid Parquet files regardless of extension.""" with patch("datanomy.cli.DatanomyApp"): runner = CliRunner() - result = runner.invoke(main, [str(file_path)]) + result = runner.invoke(main, [str(parquet_without_extension)]) - # Should accept uppercase extension + # Should accept file based on content, not extension assert result.exit_code == 0 diff --git a/tests/test_reader.py b/tests/test_reader.py index ef181fe..88a3db4 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -2,6 +2,9 @@ from pathlib import Path +import pytest +from pyarrow.lib import ArrowInvalid + from datanomy.reader.parquet import ParquetReader @@ -59,3 +62,27 @@ def test_reader_large_schema(large_schema_parquet: Path) -> None: field_names = [field.name for field in reader.schema_arrow] for i in range(50): assert f"col_{i}" in field_names + + +def test_reader_nonexistent_file(tmp_path: Path) -> None: + """Test that ParquetReader raises FileNotFoundError for nonexistent files.""" + nonexistent = tmp_path / "nonexistent.parquet" + + with pytest.raises(FileNotFoundError, match="File not found"): + ParquetReader(nonexistent) + + +def test_reader_invalid_parquet_file(invalid_parquet_file: Path) -> None: + """Test that ParquetReader raises ArrowInvalid for non-Parquet files.""" + with pytest.raises(ArrowInvalid, match="does not appear to be a Parquet file"): + ParquetReader(invalid_parquet_file) + + +def test_reader_accepts_file_without_parquet_extension( + parquet_without_extension: Path, +) -> None: + """Test that ParquetReader accepts valid Parquet files regardless of extension.""" + # Should successfully read the file + reader = ParquetReader(parquet_without_extension) + assert reader.num_rows == 3 + assert len(reader.schema_arrow) == 2