[Data] Bump min version of PyArrow from 6.0 to 9.0 (ray-project#47040)
Dropping Ray Data internal support for Arrow < 9.0.0 as of Ray 2.39.

This is necessary to avoid over-compensating for issues that newer Arrow releases have already fixed, like the ones surfacing in ray-project#48266.
Signed-off-by: JP-sDEV <jon.pablo80@gmail.com>
bveeramani authored and JP-sDEV committed Nov 14, 2024
1 parent aa24210 commit fb30b01
Showing 8 changed files with 53 additions and 47 deletions.
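For context, with the floor now at 9.0, a runtime minimum-version guard only has to reject Arrow 8 and older. Below is a minimal sketch of such a guard, assuming the `packaging` library is available; Ray's actual check lives elsewhere and may differ in detail.

```python
# Minimal sketch of the minimum-version guard implied by this bump.
# Assumes `packaging` is installed; Ray's real check may differ.
from packaging.version import Version

import pyarrow

MIN_PYARROW_VERSION = Version("9.0.0")

if Version(pyarrow.__version__) < MIN_PYARROW_VERSION:
    raise ImportError(
        f"Ray Data requires pyarrow >= {MIN_PYARROW_VERSION}, "
        f"found {pyarrow.__version__}. Upgrade with `pip install -U pyarrow`."
    )
```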
14 changes: 7 additions & 7 deletions .buildkite/data.rayci.yml
@@ -4,8 +4,8 @@ depends_on:
- oss-ci-base_ml
steps:
# builds
- name: data6build
wanda: ci/docker/data6.build.wanda.yaml
- name: data9build
wanda: ci/docker/data9.build.wanda.yaml

- name: datalbuild
wanda: ci/docker/datal.build.wanda.yaml
@@ -28,7 +28,7 @@ steps:
wanda: ci/docker/datatfxbsl.build.wanda.yaml

# tests
- label: ":database: data: arrow 6 tests"
- label: ":database: data: arrow v9 tests"
tags:
- python
- data
@@ -38,11 +38,11 @@ steps:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... //python/ray/air/... data
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
--worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--build-name data6build
--build-name data9build
--except-tags data_integration,doctest
depends_on: data6build
depends_on: data9build

- label: ":database: data: arrow 17 tests"
- label: ":database: data: arrow v17 tests"
tags:
- python
- data
@@ -56,7 +56,7 @@ steps:
--except-tags data_integration,doctest
depends_on: datalbuild

- label: ":database: data: arrow 17 {{matrix.python}} tests ({{matrix.worker_id}})"
- label: ":database: data: arrow v17 {{matrix.python}} tests ({{matrix.worker_id}})"
key: datal_python_tests
if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
tags:
6 changes: 3 additions & 3 deletions ci/docker/data6.build.wanda.yaml → ci/docker/data9.build.wanda.yaml
@@ -1,4 +1,4 @@
name: "data6build"
name: "data9build"
froms: ["cr.ray.io/rayproject/oss-ci-base_ml"]
dockerfile: ci/docker/data.build.Dockerfile
srcs:
@@ -10,6 +10,6 @@ srcs:
- python/requirements/ml/data-requirements.txt
- python/requirements/ml/data-test-requirements.txt
build_args:
- ARROW_VERSION=6.*
- ARROW_VERSION=9.*
tags:
- cr.ray.io/rayproject/data6build
- cr.ray.io/rayproject/data9build
4 changes: 2 additions & 2 deletions ci/pipeline/determine_tests_to_run.py
@@ -119,8 +119,8 @@ def get_commit_range():
or changed_file == "ci/docker/data.build.Dockerfile"
or changed_file == "ci/docker/data.build.wanda.yaml"
or changed_file == "ci/docker/datan.build.wanda.yaml"
or changed_file == "ci/docker/data6.build.wanda.yaml"
or changed_file == "ci/docker/data14.build.wanda.yaml"
or changed_file == "ci/docker/data9.build.wanda.yaml"
or changed_file == "ci/docker/datal.build.wanda.yaml"
):
RAY_CI_DATA_AFFECTED = 1
RAY_CI_ML_AFFECTED = 1
23 changes: 6 additions & 17 deletions python/ray/data/tests/test_huggingface.py
@@ -13,25 +13,14 @@ def hf_dataset():
return datasets.load_dataset("tweet_eval", "stance_climate")


def _arrow_sort_values(table: pyarrow.lib.Table) -> pyarrow.lib.Table:
"""
Sort an Arrow table by the values in the first column. Used for testing
compatibility with pyarrow 6 where `sort_by` does not exist. Inspired by:
https://stackoverflow.com/questions/70893521/how-to-sort-a-pyarrow-table
"""
by = [table.schema.names[0]] # grab first col_name
table_sorted_indexes = pyarrow.compute.bottom_k_unstable(
table, sort_keys=by, k=len(table)
)
table_sorted = table.take(table_sorted_indexes)
return table_sorted
def hfds_assert_equals(hfds: datasets.Dataset, ds: Dataset):
hfds_table = hfds.data.table
ds_table = pyarrow.concat_tables([ray.get(tbl) for tbl in ds.to_arrow_refs()])

sorting = [(name, "descending") for name in hfds_table.column_names]
hfds_table = hfds_table.sort_by(sorting)
ds_table = ds_table.sort_by(sorting)

def hfds_assert_equals(hfds: datasets.Dataset, ds: Dataset):
hfds_table = _arrow_sort_values(hfds.data.table)
ds_table = _arrow_sort_values(
pyarrow.concat_tables([ray.get(tbl) for tbl in ds.to_arrow_refs()])
)
assert hfds_table.equals(ds_table)


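The removed helper predates `pyarrow.Table.sort_by`, which has been available since pyarrow 7.0 and is safe to rely on now that the floor is 9.0. A small standalone sketch of the replacement pattern:

```python
# Standalone sketch of Table.sort_by (pyarrow >= 7.0), the API that
# replaces the bottom_k_unstable workaround removed above.
import pyarrow as pa

table = pa.table({"label": [2, 0, 1], "text": ["b", "c", "a"]})

# Sort descending on every column, mirroring hfds_assert_equals.
sorting = [(name, "descending") for name in table.column_names]
print(table.sort_by(sorting).to_pydict())
# -> {'label': [2, 1, 0], 'text': ['b', 'a', 'c']}
```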
25 changes: 12 additions & 13 deletions python/ray/data/tests/test_iceberg.py
@@ -3,7 +3,6 @@

import pyarrow as pa
import pytest
from packaging.version import Version
from pkg_resources import parse_version
from pyiceberg import catalog as pyi_catalog
from pyiceberg import expressions as pyi_expr
@@ -101,8 +100,8 @@ def pyiceberg_table():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_get_catalog():
# NOTE: Iceberg only works with PyArrow 9 or above.
@@ -121,8 +120,8 @@ def test_get_catalog():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_plan_files():
# NOTE: Iceberg only works with PyArrow 9 or above.
@@ -141,8 +140,8 @@ def test_plan_files():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_chunk_plan_files():
# NOTE: Iceberg only works with PyArrow 9 or above.
@@ -168,8 +167,8 @@ def test_chunk_plan_files():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_get_read_tasks():
# NOTE: Iceberg only works with PyArrow 9 or above.
@@ -189,8 +188,8 @@ def test_get_read_tasks():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_filtered_read():
# NOTE: Iceberg only works with PyArrow 9 or above.
@@ -215,8 +214,8 @@ def test_filtered_read():


@pytest.mark.skipif(
Version(pa.__version__) < Version("9.0.0"),
reason="PyIceberg depends on pyarrow>=9.0.0",
parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)
def test_read_basic():
# NOTE: Iceberg only works with PyArrow 9 or above.
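All six skip markers in this file follow the same pattern: gate the test on the pyarrow version Ray itself detects, rather than on `pa.__version__` via `packaging.version.Version`. A condensed sketch of the pattern (test name and body hypothetical):

```python
# Condensed sketch of the version-gated skip pattern used above.
# _get_pyarrow_version() returns the detected pyarrow version string
# (it can be None when pyarrow is absent, which real code should handle).
import pytest
from pkg_resources import parse_version

from ray._private.utils import _get_pyarrow_version


@pytest.mark.skipif(
    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
    reason="PyIceberg 0.7.0 requires a recent pyarrow",
)
def test_needs_recent_pyarrow():  # hypothetical test
    ...
```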
18 changes: 15 additions & 3 deletions python/ray/data/tests/test_parquet.py
@@ -9,9 +9,11 @@
import pyarrow.dataset as pds
import pyarrow.parquet as pq
import pytest
from pkg_resources import parse_version
from pytest_lazyfixture import lazy_fixture

import ray
from ray._private.utils import _get_pyarrow_version
from ray.air.util.tensor_extensions.arrow import ArrowTensorType, ArrowTensorTypeV2
from ray.data import Schema
from ray.data._internal.datasource.parquet_bulk_datasource import ParquetBulkDatasource
@@ -573,7 +575,7 @@ def test_parquet_read_partitioned_with_columns(ray_start_regular_shared, fs, dat
# pyarrow does not support single path with partitioning,
# this issue cannot be resolved by Ray data itself.
@pytest.mark.skipif(
tuple(pa.__version__.split(".")) < ("7",),
parse_version(_get_pyarrow_version()) < parse_version("7.0.0"),
reason="Old pyarrow behavior cannot be fixed.",
)
@pytest.mark.parametrize(
@@ -618,9 +620,19 @@ def test_parquet_read_partitioned_with_partition_filter(
),
)

assert ds.columns() == ["x", "y", "z"]
assert ds.schema() == Schema(
pa.schema(
[
("x", pa.string()),
("y", pa.string()),
("z", pa.float64()),
]
)
)

values = [[s["x"], s["y"], s["z"]] for s in ds.take()]
assert sorted(values) == [[0, "a", 0.1]]

assert sorted(values) == [["0", "a", 0.1]]


def test_parquet_read_partitioned_explicit(ray_start_regular_shared, tmp_path):
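The old skip condition in this file compared tuples of strings, which sorts lexicographically and misorders double-digit majors: pyarrow 17 compared as "older" than 7. Switching to `parse_version` fixes that. A quick demonstration:

```python
# Why the skip condition moved off string tuples: lexicographic string
# comparison misorders double-digit major versions.
from pkg_resources import parse_version

print(tuple("17.0.0".split(".")) < ("7",))               # True (wrong!)
print(parse_version("17.0.0") < parse_version("7.0.0"))  # False (correct)
```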
2 changes: 1 addition & 1 deletion python/requirements.txt
@@ -27,7 +27,7 @@ numpy>=1.20

# pyarrow 18 causes macos build failures.
# See https://github.com/ray-project/ray/pull/48300
pyarrow >= 6.0.1
pyarrow >= 9.0.0
pyarrow <18; sys_platform == "darwin" and platform_machine == "x86_64"

# ray[all]
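The two specifier lines compose: the new `>= 9.0.0` floor applies everywhere, while the `<18` cap only kicks in when the environment marker matches Intel macOS. A sketch of the combined logic using `packaging`, the library pip builds on for markers and specifiers:

```python
# Sketch of how the two pyarrow requirement lines combine.
from packaging.markers import Marker
from packaging.specifiers import SpecifierSet

FLOOR = SpecifierSet(">=9.0.0")
MAC_CAP = Marker('sys_platform == "darwin" and platform_machine == "x86_64"')

def pyarrow_allowed(version: str, env: dict) -> bool:
    spec = FLOOR
    if MAC_CAP.evaluate(env):  # cap only applies on Intel macOS
        spec &= SpecifierSet("<18")
    return version in spec

intel_mac = {"sys_platform": "darwin", "platform_machine": "x86_64"}
print(pyarrow_allowed("17.0.0", intel_mac))  # True
print(pyarrow_allowed("18.0.0", intel_mac))  # False
print(pyarrow_allowed("18.0.0", {"sys_platform": "linux", "platform_machine": "x86_64"}))  # True
```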
8 changes: 7 additions & 1 deletion python/setup.py
@@ -277,7 +277,13 @@ def get_packages(self):
"fastapi",
"watchfiles",
],
"tune": ["pandas", "tensorboardX>=1.9", "requests", *pyarrow_deps, "fsspec"],
"tune": [
"pandas",
"tensorboardX>=1.9",
"requests",
*pyarrow_deps,
"fsspec",
],
}

# Ray Serve depends on the Ray dashboard components.