Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions src/datanomy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@ def main(file: Path) -> None:
----------
file: Path to a Parquet file to inspect
"""
if not file.suffix.lower() == ".parquet":
click.echo(f"Error: {file} does not appear to be a Parquet file", err=True)
sys.exit(1)

try:
reader = ParquetReader(file)
app = DatanomyApp(reader)
Expand Down
20 changes: 17 additions & 3 deletions src/datanomy/reader/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any

import pyarrow.parquet as pq
from pyarrow.lib import ArrowInvalid


class ParquetReader:
Expand All @@ -16,9 +17,22 @@ def __init__(self, file_path: Path) -> None:
Parameters
----------
file_path: Path to the Parquet file
"""
self.file_path = file_path
self.parquet_file = pq.ParquetFile(file_path)

Raises
------
FileNotFoundError: If the file does not exist
ArrowInvalid: If the file is not a valid Parquet file
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")

try:
self.file_path = file_path
self.parquet_file = pq.ParquetFile(file_path)
except ArrowInvalid as e:
raise ArrowInvalid(
f"{file_path} does not appear to be a Parquet file"
) from e

@property
def schema_arrow(self) -> Any:
Expand Down
30 changes: 30 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,33 @@ def large_schema_parquet(tmp_path: Path) -> Path:
file_path = tmp_path / "large_schema.parquet"
pq.write_table(table, file_path)
return file_path


@pytest.fixture
def parquet_without_extension(tmp_path: Path) -> Path:
    """Write a small, valid Parquet file whose name has no extension.

    Returns:
        Path to the extension-less Parquet file inside ``tmp_path``
    """
    # The target name deliberately carries no suffix at all.
    target = tmp_path.joinpath("data_file")
    columns = {"id": [1, 2, 3], "name": ["a", "b", "c"]}
    pq.write_table(pa.table(columns), target)
    return target


@pytest.fixture
def invalid_parquet_file(tmp_path: Path) -> Path:
    """Write a plain-text file whose content is not valid Parquet.

    Returns:
        Path to the non-Parquet file inside ``tmp_path``
    """
    bogus = tmp_path.joinpath("not_a_parquet.dat")
    bogus.write_text("This is not a Parquet file")
    return bogus
30 changes: 6 additions & 24 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,6 @@
from datanomy.cli import main


def test_cli_rejects_non_parquet_extension(tmp_path: Path) -> None:
    """Check that the CLI exits with an error for a file lacking .parquet."""
    text_file = tmp_path.joinpath("test.txt")
    text_file.write_text("not parquet")

    outcome = CliRunner().invoke(main, [str(text_file)])

    # Exit status 1 plus the explanatory message signal the rejection.
    assert outcome.exit_code == 1
    assert "does not appear to be a Parquet file" in outcome.output


def test_cli_rejects_nonexistent_file() -> None:
"""Test that CLI rejects files that don't exist."""
runner = CliRunner()
Expand Down Expand Up @@ -60,19 +48,13 @@ def test_cli_creates_reader(
mock_reader.assert_called_once_with(simple_parquet)


def test_cli_case_insensitive_extension(tmp_path: Path) -> None:
"""Test that CLI accepts .PARQUET extension (case insensitive)."""
# Create a valid parquet file with uppercase extension
import pyarrow as pa
import pyarrow.parquet as pq

file_path = tmp_path / "test.PARQUET"
table = pa.table({"id": [1, 2, 3]})
pq.write_table(table, file_path)

def test_cli_accepts_parquet_without_extension(
parquet_without_extension: Path,
) -> None:
"""Test that CLI accepts valid Parquet files regardless of extension."""
with patch("datanomy.cli.DatanomyApp"):
runner = CliRunner()
result = runner.invoke(main, [str(file_path)])
result = runner.invoke(main, [str(parquet_without_extension)])

# Should accept uppercase extension
# Should accept file based on content, not extension
assert result.exit_code == 0
27 changes: 27 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from pathlib import Path

import pytest
from pyarrow.lib import ArrowInvalid

from datanomy.reader.parquet import ParquetReader


Expand Down Expand Up @@ -59,3 +62,27 @@ def test_reader_large_schema(large_schema_parquet: Path) -> None:
field_names = [field.name for field in reader.schema_arrow]
for i in range(50):
assert f"col_{i}" in field_names


def test_reader_nonexistent_file(tmp_path: Path) -> None:
    """Check that a missing path makes ParquetReader raise FileNotFoundError."""
    # Nothing is ever written at this path, so it does not exist.
    missing = tmp_path.joinpath("nonexistent.parquet")

    expect_missing = pytest.raises(FileNotFoundError, match="File not found")
    with expect_missing:
        ParquetReader(missing)


def test_reader_invalid_parquet_file(invalid_parquet_file: Path) -> None:
    """Check that non-Parquet content makes ParquetReader raise ArrowInvalid."""
    expected_message = "does not appear to be a Parquet file"
    with pytest.raises(ArrowInvalid, match=expected_message):
        ParquetReader(invalid_parquet_file)


def test_reader_accepts_file_without_parquet_extension(
    parquet_without_extension: Path,
) -> None:
    """Check that ParquetReader opens a valid Parquet file with no extension."""
    reader = ParquetReader(parquet_without_extension)

    # Expect three rows and two schema fields from the opened file.
    row_count = reader.num_rows
    assert row_count == 3
    field_count = len(reader.schema_arrow)
    assert field_count == 2