Commit e04661d
chore: increase pyarrow pin to >=16
ion-elgreco committed Aug 7, 2024
1 parent eeba882 commit e04661d
Showing 6 changed files with 16 additions and 104 deletions.
34 changes: 0 additions & 34 deletions .github/workflows/python_build.yml
@@ -30,40 +30,6 @@ jobs:
       - name: Check Rust
         run: make check-rust
 
-  test-minimal:
-    name: Python Build (Python 3.8 PyArrow 8.0.0)
-    runs-on: ubuntu-latest
-    env:
-      RUSTFLAGS: "-C debuginfo=line-tables-only"
-      CARGO_INCREMENTAL: 0
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Setup Environment
-        uses: ./.github/actions/setup-env
-        with:
-          python-version: 3.8
-
-      - name: Build and install deltalake
-        run: |
-          python -m venv venv
-          source venv/bin/activate
-          make setup
-          # Install minimum PyArrow version
-          pip install -e .[pandas,devel] pyarrow==8.0.0
-        env:
-          RUSTFLAGS: "-C debuginfo=line-tables-only"
-
-      - name: Run tests
-        run: |
-          source venv/bin/activate
-          make unit-test
-
-      # - name: Run Integration tests
-      #   run: |
-      #     py.test --cov tests -m integration
-
   test:
     name: Python Build (Python 3.10 PyArrow latest)
     runs-on: ubuntu-latest
40 changes: 4 additions & 36 deletions python/deltalake/writer.py
@@ -64,7 +64,6 @@
 else:
     _has_pandas = True
 
-PYARROW_MAJOR_VERSION = int(pa.__version__.split(".", maxsplit=1)[0])
 DEFAULT_DATA_SKIPPING_NUM_INDEX_COLS = 32
 
 DTYPE_MAP = {
@@ -394,19 +393,9 @@ def write_deltalake(
 
     if partition_by:
         table_schema: pa.Schema = schema
-        if PYARROW_MAJOR_VERSION < 12:
-            partition_schema = pa.schema(
-                [
-                    pa.field(
-                        name, _large_to_normal_dtype(table_schema.field(name).type)
-                    )
-                    for name in partition_by
-                ]
-            )
-        else:
-            partition_schema = pa.schema(
-                [table_schema.field(name) for name in partition_by]
-            )
+        partition_schema = pa.schema(
+            [table_schema.field(name) for name in partition_by]
+        )
         partitioning = ds.partitioning(partition_schema, flavor="hive")
     else:
         partitioning = None
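
With the floor at 16, the pre-12 normalization of large partition dtypes is gone and the partition schema is taken from the table schema as-is. For context, a minimal sketch of the hive-flavored partitioning this code builds; the table, column names, and temp directory are illustrative:

    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})

    # Hive flavor encodes partition columns as key=value directories, e.g. year=2023/.
    partition_schema = pa.schema([pa.field("year", pa.int64())])
    partitioning = ds.partitioning(partition_schema, flavor="hive")

    base_dir = tempfile.mkdtemp()
    ds.write_dataset(table, base_dir, format="parquet", partitioning=partitioning)

    # Reading back rediscovers the partition column from the directory names.
    roundtrip = ds.dataset(base_dir, partitioning="hive").to_table()
    assert set(roundtrip.column_names) == {"year", "value"}
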
@@ -421,18 +410,10 @@ def visitor(written_file: Any) -> None:
             columns_to_collect_stats=stats_cols,
         )
 
-        # PyArrow added support for written_file.size in 9.0.0
-        if PYARROW_MAJOR_VERSION >= 9:
-            size = written_file.size
-        elif filesystem is not None:
-            size = filesystem.get_file_info([path])[0].size
-        else:
-            size = 0
-
         add_actions.append(
             AddAction(
                 path,
-                size,
+                written_file.size,
                 partition_values,
                 int(datetime.now().timestamp() * 1000),
                 True,
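
written_file.size has been available since PyArrow 9, well under the new floor, so the fallback branches were dead code. For context, a minimal sketch of the PyArrow hook this visitor plugs into; the collecting list and temp path are illustrative:

    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"id": [1, 2, 3]})
    written = []

    def visitor(written_file):
        # The writer reports path and on-disk size for every file it produces.
        written.append((written_file.path, written_file.size))

    ds.write_dataset(table, tempfile.mkdtemp(), format="parquet", file_visitor=visitor)
    assert written and written[0][1] > 0
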
@@ -847,19 +828,6 @@ def iter_groups(metadata: Any) -> Iterator[Any]:
             # Min and Max are recorded in physical type, not logical type
             # https://stackoverflow.com/questions/66753485/decoding-parquet-min-max-statistics-for-decimal-type
             # TODO: Add logic to decode physical type for DATE, DECIMAL
-            logical_type = (
-                metadata.row_group(0)
-                .column(column_idx)
-                .statistics.logical_type.type
-            )
-
-            if PYARROW_MAJOR_VERSION < 8 and logical_type not in [
-                "STRING",
-                "INT",
-                "TIMESTAMP",
-                "NONE",
-            ]:
-                continue
 
             minimums = (
                 group.column(column_idx).statistics.min
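
The remaining TODO still holds: Parquet records min/max in the physical type, so e.g. decimals surface as raw integers unless decoded. A minimal sketch of reading row-group statistics with PyArrow; the file path is illustrative and the printed logical type depends on the column:

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    path = os.path.join(tempfile.mkdtemp(), "stats_demo.parquet")
    pq.write_table(pa.table({"n": [3, 1, 2]}), path)

    stats = pq.ParquetFile(path).metadata.row_group(0).column(0).statistics
    print(stats.min, stats.max)        # 1 3, as physical INT64 values
    print(stats.logical_type.type)     # logical annotation, e.g. NONE for plain int64
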
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
     "Programming Language :: Python :: 3.12"
 ]
 dependencies = [
-    "pyarrow>=8",
+    "pyarrow>=16",
    "pyarrow-hotfix",
 ]
 
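
A floor this strict can also be mirrored at runtime. A minimal sketch of such a guard, assuming it runs at import time; this is a hypothetical check, not deltalake's actual behavior:

    import pyarrow as pa

    # Mirrors the pyproject.toml pin; hypothetical guard for illustration only.
    major = int(pa.__version__.split(".", maxsplit=1)[0])
    if major < 16:
        raise ImportError(f"pyarrow>=16 is required, found {pa.__version__}")
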
15 changes: 3 additions & 12 deletions python/tests/test_schema.py
@@ -223,8 +223,8 @@ def test_delta_schema():
     assert schema_without_metadata == Schema.from_pyarrow(pa_schema)
 
 
-def _pyarrow_view_tuples():
-    return [
+def _generate_test_tuples():
+    test_tuples = [
         (
             pa.schema([("some_int", pa.uint32()), ("some_string", pa.string_view())]),
             pa.schema([("some_int", pa.int32()), ("some_string", pa.string_view())]),
@@ -245,11 +245,6 @@ def _pyarrow_view_tuples():
         ),
         ArrowSchemaConversionMode.PASSTHROUGH,
     ),
-]
-
-
-def _generate_test_tuples():
-    test_tuples = [
     (
         pa.schema([("some_int", pa.uint32()), ("some_string", pa.string())]),
         pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]),
@@ -522,11 +517,7 @@ def _generate_test_tuples():
         ),
     ]
 
-    if int(pa.__version__.split(".")[0]) < 16:
-        return test_tuples
-    else:
-        test_tuples.extend(_pyarrow_view_tuples())
-        return test_tuples
+    return test_tuples
 
 
 @pytest.mark.parametrize(
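
The view-type tuples no longer need a gate because the string_view bindings they exercise ship with PyArrow 16. A minimal sketch of the type in isolation, assuming the view-to-string cast available on 16+:

    import pyarrow as pa

    # Utf8View arrays hold the same values as their plain-string casts.
    view = pa.array(["a", "bb"], type=pa.string_view())
    plain = view.cast(pa.string())
    assert plain.equals(pa.array(["a", "bb"]))
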
12 changes: 4 additions & 8 deletions python/tests/test_table_read.py
@@ -6,8 +6,6 @@
 from typing import Any, List, Tuple
 from unittest.mock import Mock
 
-from packaging import version
-
 from deltalake._util import encode_partition_value
 from deltalake.exceptions import DeltaProtocolError
 from deltalake.table import ProtocolVersions
@@ -280,13 +278,11 @@ def test_read_table_with_stats():
     data = dataset.to_table(filter=filter_expr)
     assert data.num_rows == 0
 
-    # PyArrow added support for is_null and is_valid simplification in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        filter_expr = ds.field("cases").is_null()
-        assert len(list(dataset.get_fragments(filter=filter_expr))) == 0
+    filter_expr = ds.field("cases").is_null()
+    assert len(list(dataset.get_fragments(filter=filter_expr))) == 0
 
-        data = dataset.to_table(filter=filter_expr)
-        assert data.num_rows == 0
+    data = dataset.to_table(filter=filter_expr)
+    assert data.num_rows == 0
 
 
 def test_read_special_partition():
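
The is_null simplification the old guard waited for has long been baseline, so the version check was dead code. A minimal sketch of the same expression against a freshly written Parquet dataset; paths and data are illustrative:

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds
    import pyarrow.parquet as pq

    path = os.path.join(tempfile.mkdtemp(), "cases.parquet")
    pq.write_table(pa.table({"cases": [1, None, 3]}), path)

    dataset = ds.dataset(path)
    filter_expr = ds.field("cases").is_null()
    # The same expression drives both fragment pruning and row filtering.
    assert dataset.to_table(filter=filter_expr).num_rows == 1
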
17 changes: 4 additions & 13 deletions python/tests/test_writer.py
@@ -13,7 +13,6 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 import pytest
-from packaging import version
 from pyarrow.dataset import ParquetFileFormat, ParquetReadOptions, dataset
 from pyarrow.lib import RecordBatchReader
 
@@ -710,9 +709,8 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
         },
     }
     # PyArrow added support for decimal and date32 in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        expected_mins["decimal"] = 10.0
-        expected_mins["date32"] = "2022-01-01"
+    expected_mins["decimal"] = 10.0
+    expected_mins["date32"] = "2022-01-01"
 
     assert stats["minValues"] == expected_mins
 
@@ -730,9 +728,8 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
         "struct": {"x": 4, "y": "4"},
     }
     # PyArrow added support for decimal and date32 in 8.0.0
-    if version.parse(pa.__version__).major >= 8:
-        expected_maxs["decimal"] = 14.0
-        expected_maxs["date32"] = "2022-01-05"
+    expected_maxs["decimal"] = 14.0
+    expected_maxs["date32"] = "2022-01-05"
 
     assert stats["maxValues"] == expected_maxs
 
@@ -1323,9 +1320,6 @@ def test_large_arrow_types(tmp_path: pathlib.Path):
     assert table.schema == dt.schema().to_pyarrow(as_large_types=True)
 
 
-@pytest.mark.skipif(
-    int(pa.__version__.split(".")[0]) < 10, reason="map casts require pyarrow >= 10"
-)
 def test_large_arrow_types_dataset_as_large_types(tmp_path: pathlib.Path):
     pylist = [
         {"name": "Joey", "gender": b"M", "arr_type": ["x", "y"], "dict": {"a": b"M"}},
@@ -1351,9 +1345,6 @@ def test_large_arrow_types_dataset_as_large_types(tmp_path: pathlib.Path):
     assert union_ds.to_table().shape[0] == 4
 
 
-@pytest.mark.skipif(
-    int(pa.__version__.split(".")[0]) < 10, reason="map casts require pyarrow >= 10"
-)
 def test_large_arrow_types_explicit_scan_schema(tmp_path: pathlib.Path):
     pylist = [
         {"name": "Joey", "gender": b"M", "arr_type": ["x", "y"], "dict": {"a": b"M"}},
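
Both skip markers guarded map casts that need PyArrow 10+, comfortably below the new floor. A minimal sketch of the large-to-normal cast these tests exercise; the data is illustrative:

    import pyarrow as pa

    # Large variants differ only in offset width; the cast preserves values.
    table = pa.table(
        {
            "name": pa.array(["Joey"], type=pa.large_string()),
            "dict": pa.array(
                [[("a", b"M")]], type=pa.map_(pa.large_string(), pa.large_binary())
            ),
        }
    )
    normal = table.cast(
        pa.schema(
            [("name", pa.string()), ("dict", pa.map_(pa.string(), pa.binary()))]
        )
    )
    assert normal["dict"].to_pylist() == [[("a", b"M")]]
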
