[Data] Bump min version of PyArrow from 6.0 to 9.0 (#47040)
Dropping Ray Data internal support for Arrow < 9.0.0 as of Ray 2.39.

This is necessary to avoid over-compensating for issues that newer Arrow releases have already resolved, like the ones surfacing in #48266.
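With a single supported floor, scattered per-version workarounds collapse into one guard. As a rough illustration (not the actual Ray code; the constant name and error message are invented for the sketch), a minimum-version check looks like:

```python
# Illustrative sketch only -- not code added by this commit.
from pkg_resources import parse_version  # same helper the updated tests use

import pyarrow

MIN_ARROW_VERSION = "9.0.0"  # hypothetical name for the new floor


def check_pyarrow_version() -> None:
    """Fail fast if the installed PyArrow predates the supported floor."""
    if parse_version(pyarrow.__version__) < parse_version(MIN_ARROW_VERSION):
        raise ImportError(
            f"Ray Data requires pyarrow >= {MIN_ARROW_VERSION}, "
            f"found {pyarrow.__version__}."
        )
```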
bveeramani authored Nov 1, 2024
1 parent cf85018 commit 0a779ad
Showing 8 changed files with 53 additions and 47 deletions.
14 changes: 7 additions & 7 deletions .buildkite/data.rayci.yml
@@ -4,8 +4,8 @@ depends_on:
   - oss-ci-base_ml
 steps:
   # builds
-  - name: data6build
-    wanda: ci/docker/data6.build.wanda.yaml
+  - name: data9build
+    wanda: ci/docker/data9.build.wanda.yaml
 
   - name: datalbuild
     wanda: ci/docker/datal.build.wanda.yaml
@@ -28,7 +28,7 @@ steps:
     wanda: ci/docker/datatfxbsl.build.wanda.yaml
 
   # tests
-  - label: ":database: data: arrow 6 tests"
+  - label: ":database: data: arrow v9 tests"
     tags:
       - python
       - data
@@ -38,11 +38,11 @@ steps:
       - bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... //python/ray/air/... data
         --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}"
         --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
-        --build-name data6build
+        --build-name data9build
         --except-tags data_integration,doctest
-    depends_on: data6build
+    depends_on: data9build
 
-  - label: ":database: data: arrow 17 tests"
+  - label: ":database: data: arrow v17 tests"
     tags:
       - python
       - data
@@ -56,7 +56,7 @@ steps:
       --except-tags data_integration,doctest
     depends_on: datalbuild
 
-  - label: ":database: data: arrow 17 {{matrix.python}} tests ({{matrix.worker_id}})"
+  - label: ":database: data: arrow v17 {{matrix.python}} tests ({{matrix.worker_id}})"
     key: datal_python_tests
     if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
     tags:
6 changes: 3 additions & 3 deletions ci/docker/{data6.build.wanda.yaml → data9.build.wanda.yaml}
@@ -1,4 +1,4 @@
-name: "data6build"
+name: "data9build"
 froms: ["cr.ray.io/rayproject/oss-ci-base_ml"]
 dockerfile: ci/docker/data.build.Dockerfile
 srcs:
@@ -10,6 +10,6 @@ srcs:
   - python/requirements/ml/data-requirements.txt
   - python/requirements/ml/data-test-requirements.txt
 build_args:
-  - ARROW_VERSION=6.*
+  - ARROW_VERSION=9.*
 tags:
-  - cr.ray.io/rayproject/data6build
+  - cr.ray.io/rayproject/data9build
4 changes: 2 additions & 2 deletions ci/pipeline/determine_tests_to_run.py
@@ -119,8 +119,8 @@ def get_commit_range():
             or changed_file == "ci/docker/data.build.Dockerfile"
             or changed_file == "ci/docker/data.build.wanda.yaml"
             or changed_file == "ci/docker/datan.build.wanda.yaml"
-            or changed_file == "ci/docker/data6.build.wanda.yaml"
-            or changed_file == "ci/docker/data14.build.wanda.yaml"
+            or changed_file == "ci/docker/data9.build.wanda.yaml"
+            or changed_file == "ci/docker/datal.build.wanda.yaml"
         ):
             RAY_CI_DATA_AFFECTED = 1
             RAY_CI_ML_AFFECTED = 1
23 changes: 6 additions & 17 deletions python/ray/data/tests/test_huggingface.py
@@ -13,25 +13,14 @@ def hf_dataset():
     return datasets.load_dataset("tweet_eval", "stance_climate")
 
 
-def _arrow_sort_values(table: pyarrow.lib.Table) -> pyarrow.lib.Table:
-    """
-    Sort an Arrow table by the values in the first column. Used for testing
-    compatibility with pyarrow 6 where `sort_by` does not exist. Inspired by:
-    https://stackoverflow.com/questions/70893521/how-to-sort-a-pyarrow-table
-    """
-    by = [table.schema.names[0]]  # grab first col_name
-    table_sorted_indexes = pyarrow.compute.bottom_k_unstable(
-        table, sort_keys=by, k=len(table)
-    )
-    table_sorted = table.take(table_sorted_indexes)
-    return table_sorted
+def hfds_assert_equals(hfds: datasets.Dataset, ds: Dataset):
+    hfds_table = hfds.data.table
+    ds_table = pyarrow.concat_tables([ray.get(tbl) for tbl in ds.to_arrow_refs()])
+
+    sorting = [(name, "descending") for name in hfds_table.column_names]
+    hfds_table = hfds_table.sort_by(sorting)
+    ds_table = ds_table.sort_by(sorting)
 
-def hfds_assert_equals(hfds: datasets.Dataset, ds: Dataset):
-    hfds_table = _arrow_sort_values(hfds.data.table)
-    ds_table = _arrow_sort_values(
-        pyarrow.concat_tables([ray.get(tbl) for tbl in ds.to_arrow_refs()])
-    )
     assert hfds_table.equals(ds_table)
 
 
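The deleted `_arrow_sort_values` helper existed only because `Table.sort_by` was missing on the old Arrow 6 floor (it arrived in Arrow 7.0.0). With the floor at 9.0.0 the tests call it directly; a self-contained sketch of the replacement API:

```python
import pyarrow as pa

table = pa.table({"label": [2, 0, 1], "text": ["b", "c", "a"]})

# Table.sort_by takes (column, order) pairs, replacing the manual
# bottom_k_unstable workaround that Arrow 6 required.
sorted_table = table.sort_by([("label", "descending"), ("text", "descending")])
print(sorted_table.to_pydict())  # {'label': [2, 1, 0], 'text': ['b', 'a', 'c']}
```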
25 changes: 12 additions & 13 deletions python/ray/data/tests/test_iceberg.py
@@ -3,7 +3,6 @@
 
 import pyarrow as pa
 import pytest
-from packaging.version import Version
 from pkg_resources import parse_version
 from pyiceberg import catalog as pyi_catalog
 from pyiceberg import expressions as pyi_expr
@@ -101,8 +100,8 @@ def pyiceberg_table():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_get_catalog():
     # NOTE: Iceberg only works with PyArrow 9 or above.
@@ -121,8 +120,8 @@ def test_get_catalog():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_plan_files():
     # NOTE: Iceberg only works with PyArrow 9 or above.
@@ -141,8 +140,8 @@ def test_plan_files():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_chunk_plan_files():
     # NOTE: Iceberg only works with PyArrow 9 or above.
@@ -168,8 +167,8 @@ def test_chunk_plan_files():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_get_read_tasks():
     # NOTE: Iceberg only works with PyArrow 9 or above.
@@ -189,8 +188,8 @@ def test_get_read_tasks():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_filtered_read():
     # NOTE: Iceberg only works with PyArrow 9 or above.
@@ -215,8 +214,8 @@ def test_filtered_read():
 
 
 @pytest.mark.skipif(
-    Version(pa.__version__) < Version("9.0.0"),
-    reason="PyIceberg depends on pyarrow>=9.0.0",
+    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
+    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
 )
 def test_read_basic():
     # NOTE: Iceberg only works with PyArrow 9 or above.
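The same guard now repeats six times in this file; a natural follow-up (a sketch only, not part of this commit) would hoist it into one module-level marker:

```python
# Sketch of a possible deduplication -- this refactor is not in the commit.
import pytest
from pkg_resources import parse_version

from ray._private.utils import _get_pyarrow_version

requires_pyarrow_14 = pytest.mark.skipif(
    parse_version(_get_pyarrow_version()) < parse_version("14.0.0"),
    reason="PyIceberg 0.7.0 fails on pyarrow <= 14.0.0",
)


@requires_pyarrow_14
def test_read_basic():
    ...
```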
18 changes: 15 additions & 3 deletions python/ray/data/tests/test_parquet.py
@@ -9,9 +9,11 @@
 import pyarrow.dataset as pds
 import pyarrow.parquet as pq
 import pytest
+from pkg_resources import parse_version
 from pytest_lazyfixture import lazy_fixture
 
 import ray
+from ray._private.utils import _get_pyarrow_version
 from ray.air.util.tensor_extensions.arrow import ArrowTensorType, ArrowTensorTypeV2
 from ray.data import Schema
 from ray.data._internal.datasource.parquet_bulk_datasource import ParquetBulkDatasource
@@ -573,7 +575,7 @@ def test_parquet_read_partitioned_with_columns(ray_start_regular_shared, fs, dat
 # pyarrow does not support single path with partitioning,
 # this issue cannot be resolved by Ray data itself.
 @pytest.mark.skipif(
-    tuple(pa.__version__.split(".")) < ("7",),
+    parse_version(_get_pyarrow_version()) < parse_version("7.0.0"),
     reason="Old pyarrow behavior cannot be fixed.",
 )
 @pytest.mark.parametrize(
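The old guard compared version components as strings, which misorders double-digit majors; a quick demonstration of why the commit switches to `parse_version`:

```python
from pkg_resources import parse_version

# String tuples compare lexicographically, so "17" sorts before "7":
assert tuple("17.0.0".split(".")) < ("7",)  # wrongly True on PyArrow 17
assert not (parse_version("17.0.0") < parse_version("7.0.0"))  # correct ordering
```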
@@ -618,9 +620,19 @@ def test_parquet_read_partitioned_with_partition_filter(
         ),
     )
 
-    assert ds.columns() == ["x", "y", "z"]
+    assert ds.schema() == Schema(
+        pa.schema(
+            [
+                ("x", pa.string()),
+                ("y", pa.string()),
+                ("z", pa.float64()),
+            ]
+        )
+    )
 
     values = [[s["x"], s["y"], s["z"]] for s in ds.take()]
-    assert sorted(values) == [[0, "a", 0.1]]
+    assert sorted(values) == [["0", "a", 0.1]]
 
 
 def test_parquet_read_partitioned_explicit(ray_start_regular_shared, tmp_path):
2 changes: 1 addition & 1 deletion python/requirements.txt
@@ -27,7 +27,7 @@ numpy>=1.20
 
 # pyarrow 18 causes macos build failures.
 # See https://github.com/ray-project/ray/pull/48300
-pyarrow >= 6.0.1
+pyarrow >= 9.0.0
 pyarrow <18; sys_platform == "darwin" and platform_machine == "x86_64"
 
 # ray[all]
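Both specifiers apply together: on macOS x86_64 the effective range is `pyarrow>=9.0.0,<18`, while other platforms only get the new floor. A small sketch (using `packaging`, which implements these environment markers) of how the two lines combine:

```python
from packaging.requirements import Requirement
from packaging.version import Version

reqs = [
    Requirement("pyarrow>=9.0.0"),
    Requirement('pyarrow<18; sys_platform == "darwin" and platform_machine == "x86_64"'),
]


def allowed(version: str, env: dict) -> bool:
    """True if `version` satisfies every requirement whose marker matches `env`."""
    return all(
        Version(version) in r.specifier
        for r in reqs
        if r.marker is None or r.marker.evaluate(env)
    )


mac_intel = {"sys_platform": "darwin", "platform_machine": "x86_64"}
linux = {"sys_platform": "linux", "platform_machine": "x86_64"}

assert not allowed("8.0.0", linux)   # below the new floor everywhere
assert allowed("18.0.0", linux)      # the <18 cap applies only on mac x86_64
assert not allowed("18.0.0", mac_intel)
```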
8 changes: 7 additions & 1 deletion python/setup.py
@@ -277,7 +277,13 @@ def get_packages(self):
         "fastapi",
         "watchfiles",
     ],
-    "tune": ["pandas", "tensorboardX>=1.9", "requests", *pyarrow_deps, "fsspec"],
+    "tune": [
+        "pandas",
+        "tensorboardX>=1.9",
+        "requests",
+        *pyarrow_deps,
+        "fsspec",
+    ],
 }
 
 # Ray Serve depends on the Ray dashboard components.
