Commit e04661d
chore: increase pyarrow pin to >=16
ion-elgreco committed Aug 7, 2024
1 parent eeba882 commit e04661d
Showing 6 changed files with 16 additions and 104 deletions.
34 changes: 0 additions & 34 deletions .github/workflows/python_build.yml
@@ -30,40 +30,6 @@ jobs:
       - name: Check Rust
         run: make check-rust
 
-  test-minimal:
-    name: Python Build (Python 3.8 PyArrow 8.0.0)
-    runs-on: ubuntu-latest
-    env:
-      RUSTFLAGS: "-C debuginfo=line-tables-only"
-      CARGO_INCREMENTAL: 0
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Setup Environment
-        uses: ./.github/actions/setup-env
-        with:
-          python-version: 3.8
-
-      - name: Build and install deltalake
-        run: |
-          python -m venv venv
-          source venv/bin/activate
-          make setup
-          # Install minimum PyArrow version
-          pip install -e .[pandas,devel] pyarrow==8.0.0
-        env:
-          RUSTFLAGS: "-C debuginfo=line-tables-only"
-
-      - name: Run tests
-        run: |
-          source venv/bin/activate
-          make unit-test
-
-      # - name: Run Integration tests
-      #   run: |
-      #     py.test --cov tests -m integration
-
   test:
     name: Python Build (Python 3.10 PyArrow latest)
     runs-on: ubuntu-latest
40 changes: 4 additions & 36 deletions python/deltalake/writer.py
@@ -64,7 +64,6 @@
 else:
     _has_pandas = True
 
-PYARROW_MAJOR_VERSION = int(pa.__version__.split(".", maxsplit=1)[0])
 DEFAULT_DATA_SKIPPING_NUM_INDEX_COLS = 32
 
 DTYPE_MAP = {
@@ -394,19 +393,9 @@ def write_deltalake(
 
     if partition_by:
         table_schema: pa.Schema = schema
-        if PYARROW_MAJOR_VERSION < 12:
-            partition_schema = pa.schema(
-                [
-                    pa.field(
-                        name, _large_to_normal_dtype(table_schema.field(name).type)
-                    )
-                    for name in partition_by
-                ]
-            )
-        else:
-            partition_schema = pa.schema(
-                [table_schema.field(name) for name in partition_by]
-            )
+        partition_schema = pa.schema(
+            [table_schema.field(name) for name in partition_by]
+        )
         partitioning = ds.partitioning(partition_schema, flavor="hive")
     else:
         partitioning = None
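
With the floor at 16, the pre-12 normalization of large partition dtypes is gone and the partition schema is taken from the table schema as-is. For context, a minimal sketch of the hive-flavored partitioning this code builds; the table, column names, and temp directory are illustrative:

    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})

    # Hive flavor encodes partition columns as key=value directories, e.g. year=2023/.
    partition_schema = pa.schema([pa.field("year", pa.int64())])
    partitioning = ds.partitioning(partition_schema, flavor="hive")

    base_dir = tempfile.mkdtemp()
    ds.write_dataset(table, base_dir, format="parquet", partitioning=partitioning)

    # Reading back rediscovers the partition column from the directory names.
    roundtrip = ds.dataset(base_dir, partitioning="hive").to_table()
    assert set(roundtrip.column_names) == {"year", "value"}
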
@@ -421,18 +410,10 @@ def visitor(written_file: Any) -> None:
             columns_to_collect_stats=stats_cols,
         )
 
-        # PyArrow added support for written_file.size in 9.0.0
-        if PYARROW_MAJOR_VERSION >= 9:
-            size = written_file.size
-        elif filesystem is not None:
-            size = filesystem.get_file_info([path])[0].size
-        else:
-            size = 0
-
         add_actions.append(
             AddAction(
                 path,
-                size,
+                written_file.size,
                 partition_values,
                 int(datetime.now().timestamp() * 1000),
                 True,
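
written_file.size has been available since PyArrow 9, well under the new floor, so the fallback branches were dead code. For context, a minimal sketch of the PyArrow hook this visitor plugs into; the collecting list and temp path are illustrative:

    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"id": [1, 2, 3]})
    written = []

    def visitor(written_file):
        # The writer reports path and on-disk size for every file it produces.
        written.append((written_file.path, written_file.size))

    ds.write_dataset(table, tempfile.mkdtemp(), format="parquet", file_visitor=visitor)
    assert written and written[0][1] > 0
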
@@ -847,19 +828,6 @@ def iter_groups(metadata: Any) -> Iterator[Any]:
             # Min and Max are recorded in physical type, not logical type
             # https://stackoverflow.com/questions/66753485/decoding-parquet-min-max-statistics-for-decimal-type
             # TODO: Add logic to decode physical type for DATE, DECIMAL
-            logical_type = (
-                metadata.row_group(0)
-                .column(column_idx)
-                .statistics.logical_type.type
-            )
-
-            if PYARROW_MAJOR_VERSION < 8 and logical_type not in [
-                "STRING",
-                "INT",
-                "TIMESTAMP",
-                "NONE",
-            ]:
-                continue
 
             minimums = (
                 group.column(column_idx).statistics.min
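
The remaining TODO still holds: Parquet records min/max in the physical type, so e.g. decimals surface as raw integers unless decoded. A minimal sketch of reading row-group statistics with PyArrow; the file path is illustrative and the printed logical type depends on the column:

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    path = os.path.join(tempfile.mkdtemp(), "stats_demo.parquet")
    pq.write_table(pa.table({"n": [3, 1, 2]}), path)

    stats = pq.ParquetFile(path).metadata.row_group(0).column(0).statistics
    print(stats.min, stats.max)        # 1 3, as physical INT64 values
    print(stats.logical_type.type)     # logical annotation, e.g. NONE for plain int64
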
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
     "Programming Language :: Python :: 3.12"
 ]
 dependencies = [
-    "pyarrow>=8",
+    "pyarrow>=16",
    "pyarrow-hotfix",
 ]
 
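
A floor this strict can also be mirrored at runtime. A minimal sketch of such a guard, assuming it runs at import time; this is a hypothetical check, not deltalake's actual behavior:

    import pyarrow as pa

    # Mirrors the pyproject.toml pin; hypothetical guard for illustration only.
    major = int(pa.__version__.split(".", maxsplit=1)[0])
    if major < 16:
        raise ImportError(f"pyarrow>=16 is required, found {pa.__version__}")
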
15 changes: 3 additions & 12 deletions python/tests/test_schema.py
@@ -223,8 +223,8 @@ def test_delta_schema():
     assert schema_without_metadata == Schema.from_pyarrow(pa_schema)
 
 
-def _pyarrow_view_tuples():
-    return [
+def _generate_test_tuples():
+    test_tuples = [
         (
             pa.schema([("some_int", pa.uint32()), ("some_string", pa.string_view())]),
             pa.schema([("some_int", pa.int32()), ("some_string", pa.string_view())]),
@@ -245,11 +245,6 @@ def _pyarrow_view_tuples():
         ),
         ArrowSchemaConversionMode.PASSTHROUGH,
     ),
-]
-
-
-def _generate_test_tuples():
-    test_tuples = [
     (
         pa.schema([("some_int", pa.uint32()), ("some_string", pa.string())]),
         pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]),
@@ -522,11 +517,7 @@ def _generate_test_tuples():
         ),
     ]
 
-    if int(pa.__version__.split(".")[0]) < 16:
-        return test_tuples
-    else:
-        test_tuples.extend(_pyarrow_view_tuples())
-        return test_tuples
+    return test_tuples
 
 
 @pytest.mark.parametrize(
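
The view-type tuples no longer need a gate because the string_view bindings they exercise ship with PyArrow 16. A minimal sketch of the type in isolation, assuming the view-to-string cast available on 16+:

    import pyarrow as pa

    # Utf8View arrays hold the same values as their plain-string casts.
    view = pa.array(["a", "bb"], type=pa.string_view())
    plain = view.cast(pa.string())
    assert plain.equals(pa.array(["a", "bb"]))
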
12 changes: 4 additions & 8 deletions python/tests/test_table_read.py
@@ -6,8 +6,6 @@
 from typing import Any, List, Tuple
 from unittest.mock import Mock
 
-from packaging import version
-
 from deltalake._util import encode_partition_value
 from deltalake.exceptions import DeltaProtocolError
 from deltalake.table import ProtocolVersions
@@ -280,13 +278,11 @@ def test_read_table_with_stats():
     data = dataset.to_table(filter=filter_expr)
     assert data.num_rows == 0
 
-    # PyArrow added support for is_null and is_valid simplification in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        filter_expr = ds.field("cases").is_null()
-        assert len(list(dataset.get_fragments(filter=filter_expr))) == 0
+    filter_expr = ds.field("cases").is_null()
+    assert len(list(dataset.get_fragments(filter=filter_expr))) == 0
 
-        data = dataset.to_table(filter=filter_expr)
-        assert data.num_rows == 0
+    data = dataset.to_table(filter=filter_expr)
+    assert data.num_rows == 0
 
 
 def test_read_special_partition():
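
The is_null simplification the old guard waited for has long been baseline, so the version check was dead code. A minimal sketch of the same expression against a freshly written Parquet dataset; paths and data are illustrative:

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds
    import pyarrow.parquet as pq

    path = os.path.join(tempfile.mkdtemp(), "cases.parquet")
    pq.write_table(pa.table({"cases": [1, None, 3]}), path)

    dataset = ds.dataset(path)
    filter_expr = ds.field("cases").is_null()
    # The same expression drives both fragment pruning and row filtering.
    assert dataset.to_table(filter=filter_expr).num_rows == 1
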
17 changes: 4 additions & 13 deletions python/tests/test_writer.py
@@ -13,7 +13,6 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 import pytest
-from packaging import version
 from pyarrow.dataset import ParquetFileFormat, ParquetReadOptions, dataset
 from pyarrow.lib import RecordBatchReader
 
@@ -710,9 +709,8 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
         },
     }
     # PyArrow added support for decimal and date32 in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        expected_mins["decimal"] = 10.0
-        expected_mins["date32"] = "2022-01-01"
+    expected_mins["decimal"] = 10.0
+    expected_mins["date32"] = "2022-01-01"
 
     assert stats["minValues"] == expected_mins
 
@@ -730,9 +728,8 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
         "struct": {"x": 4, "y": "4"},
     }
     # PyArrow added support for decimal and date32 in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        expected_maxs["decimal"] = 14.0
-        expected_maxs["date32"] = "2022-01-05"
+    expected_maxs["decimal"] = 14.0
+    expected_maxs["date32"] = "2022-01-05"
 
     assert stats["maxValues"] == expected_maxs
 
@@ -1323,9 +1320,6 @@ def test_large_arrow_types(tmp_path: pathlib.Path):
     assert table.schema == dt.schema().to_pyarrow(as_large_types=True)
 
 
-@pytest.mark.skipif(
-    int(pa.__version__.split(".")[0]) < 10, reason="map casts require pyarrow >= 10"
-)
 def test_large_arrow_types_dataset_as_large_types(tmp_path: pathlib.Path):
     pylist = [
         {"name": "Joey", "gender": b"M", "arr_type": ["x", "y"], "dict": {"a": b"M"}},
@@ -1351,9 +1345,6 @@ def test_large_arrow_types_dataset_as_large_types(tmp_path: pathlib.Path):
     assert union_ds.to_table().shape[0] == 4
 
 
-@pytest.mark.skipif(
-    int(pa.__version__.split(".")[0]) < 10, reason="map casts require pyarrow >= 10"
-)
 def test_large_arrow_types_explicit_scan_schema(tmp_path: pathlib.Path):
     pylist = [
         {"name": "Joey", "gender": b"M", "arr_type": ["x", "y"], "dict": {"a": b"M"}},
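
Both skip markers guarded map casts that need PyArrow 10+, comfortably below the new floor. A minimal sketch of the large-to-normal cast these tests exercise; the data is illustrative:

    import pyarrow as pa

    # Large variants differ only in offset width; the cast preserves values.
    table = pa.table(
        {
            "name": pa.array(["Joey"], type=pa.large_string()),
            "dict": pa.array(
                [[("a", b"M")]], type=pa.map_(pa.large_string(), pa.large_binary())
            ),
        }
    )
    normal = table.cast(
        pa.schema(
            [("name", pa.string()), ("dict", pa.map_(pa.string(), pa.binary()))]
        )
    )
    assert normal["dict"].to_pylist() == [[("a", b"M")]]
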
