[data][api] implement HudiDatasource (#46273)

Support reading from a Hudi table into a Ray dataset.

Signed-off-by: Shiyan Xu <2701446+xushiyan@users.noreply.github.com>
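A minimal usage sketch of the new API, assuming a Hudi table already exists on disk (the path below is a hypothetical placeholder; ray.data.read_hudi is the public entry point exercised by the test in this commit):

import ray

# Read a Hudi copy-on-write table into a Ray dataset.
# "/tmp/trips_table" is a placeholder path to an existing Hudi table.
ds = ray.data.read_hudi("/tmp/trips_table")

# The schema includes Hudi meta columns such as _hoodie_commit_time.
print(ds.schema())
print(ds.count())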
Showing 10 changed files with 282 additions and 1 deletion.
Word-list change (adds the [Hh]udi pattern to the existing vocabulary entries):

@@ -7,6 +7,7 @@ Data('s)?
 [Dd]iscretizer(s)?
 dtype
 [Gg]roupby
+[Hh]udi
 [Ii]ndexable
 [Ii]ngest
 [Ii]nqueue(s)?
New file (the HudiDatasource implementation):

@@ -0,0 +1,91 @@
import logging
import os
from typing import Dict, Iterator, List, Optional

from ray.data._internal.util import _check_import
from ray.data.block import BlockMetadata
from ray.data.datasource.datasource import Datasource, ReadTask

logger = logging.getLogger(__name__)


class HudiDatasource(Datasource):
    """Hudi datasource, for reading Apache Hudi table."""

    def __init__(
        self,
        table_uri: str,
        storage_options: Optional[Dict[str, str]] = None,
    ):
        # Fail early if the optional `hudi-python` dependency is missing.
        _check_import(self, module="hudi", package="hudi-python")

        self._table_uri = table_uri
        self._storage_options = storage_options

    def get_read_tasks(self, parallelism: int) -> List["ReadTask"]:
        import pyarrow
        from hudi import HudiTable

        def _perform_read(
            table_uri: str,
            base_file_paths: List[str],
            options: Dict[str, str],
        ) -> Iterator["pyarrow.Table"]:
            from hudi import HudiFileGroupReader

            for p in base_file_paths:
                file_group_reader = HudiFileGroupReader(table_uri, options)
                batch = file_group_reader.read_file_slice_by_base_file_path(p)
                yield pyarrow.Table.from_batches([batch])

        hudi_table = HudiTable(self._table_uri, self._storage_options)

        reader_options = {
            **hudi_table.storage_options(),
            **hudi_table.hudi_options(),
        }

        schema = hudi_table.get_schema()
        read_tasks = []
        for file_slices_split in hudi_table.split_file_slices(parallelism):
            if len(file_slices_split) == 0:
                # When the table is empty, this will be an empty split.
                continue

            num_rows = 0
            relative_paths = []
            input_files = []
            size_bytes = 0
            for file_slice in file_slices_split:
                # A file slice in a Hudi table is a logical group of data files
                # within a physical partition. Records stored in a file slice
                # are associated with a commit on the Hudi table's timeline.
                # For more info, see https://hudi.apache.org/docs/file_layouts
                num_rows += file_slice.num_records
                relative_path = file_slice.base_file_relative_path()
                relative_paths.append(relative_path)
                full_path = os.path.join(self._table_uri, relative_path)
                input_files.append(full_path)
                size_bytes += file_slice.base_file_size

            metadata = BlockMetadata(
                num_rows=num_rows,
                schema=schema,
                input_files=input_files,
                size_bytes=size_bytes,
                exec_stats=None,
            )

            read_task = ReadTask(
                # Bind this split's paths via a default argument so each task
                # reads its own file slices (lambdas otherwise late-bind the
                # loop variable).
                read_fn=lambda paths=relative_paths: _perform_read(
                    self._table_uri, paths, reader_options
                ),
                metadata=metadata,
            )
            read_tasks.append(read_task)

        return read_tasks

    def estimate_inmemory_data_size(self) -> Optional[int]:
        # TODO(xushiyan) add APIs to provide estimated in-memory size
        return None
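The class can also be used directly through Ray Data's generic read entry point. A minimal sketch, assuming HudiDatasource is importable in your environment; the S3 URI and storage options are hypothetical placeholders:

import ray

# Assumes HudiDatasource (defined above) is importable here.
datasource = HudiDatasource(
    table_uri="s3://my-bucket/trips_table",       # placeholder table location
    storage_options={"aws_region": "us-west-2"},  # placeholder storage config
)
ds = ray.data.read_datasource(datasource, parallelism=4)
print(ds.count())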
Binary file not shown (a new binary test fixture: a zipped Hudi table).
New file (test for reading a Hudi copy-on-write table):

@@ -0,0 +1,114 @@
import os
import zipfile

import pytest
from packaging.version import parse as parse_version
from pytest_lazyfixture import lazy_fixture

import ray
from ray._private.utils import _get_pyarrow_version
from ray.data.datasource.path_util import (
    _resolve_paths_and_filesystem,
    _unwrap_protocol,
)
from ray.data.tests.conftest import *  # noqa
from ray.data.tests.mock_http_server import *  # noqa
from ray.tests.conftest import *  # noqa

MIN_PYARROW_VERSION_FOR_HUDI = parse_version("11.0.0")
_VER = _get_pyarrow_version()
PYARROW_VERSION = parse_version(_VER) if _VER else None
PYARROW_VERSION_MEETS_REQUIREMENT = (
    PYARROW_VERSION and PYARROW_VERSION >= MIN_PYARROW_VERSION_FOR_HUDI
)
PYARROW_HUDI_TEST_SKIP_REASON = (
    f"Hudi only supported if pyarrow >= {MIN_PYARROW_VERSION_FOR_HUDI}"
)


def _extract_testing_table(fixture_path: str, table_dir: str, target_dir: str) -> str:
    with zipfile.ZipFile(fixture_path, "r") as zip_ref:
        zip_ref.extractall(target_dir)
    return os.path.join(target_dir, table_dir)


@pytest.mark.skipif(
    not PYARROW_VERSION_MEETS_REQUIREMENT,
    reason=PYARROW_HUDI_TEST_SKIP_REASON,
)
@pytest.mark.parametrize(
    "fs,data_path",
    [
        (None, lazy_fixture("local_path")),
        (lazy_fixture("local_fs"), lazy_fixture("local_path")),
    ],
)
def test_read_hudi_simple_cow_table(ray_start_regular_shared, fs, data_path):
    setup_data_path = _unwrap_protocol(data_path)
    target_testing_dir = os.path.join(setup_data_path, "test_hudi")
    fixture_path, _ = _resolve_paths_and_filesystem(
        "example://hudi-tables/0.x_cow_partitioned.zip", fs
    )
    target_table_path = _extract_testing_table(
        fixture_path[0], "trips_table", target_testing_dir
    )

    ds = ray.data.read_hudi(target_table_path)

    assert ds.schema().names == [
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_partition_path",
        "_hoodie_file_name",
        "ts",
        "uuid",
        "rider",
        "driver",
        "fare",
        "city",
    ]
    assert ds.count() == 5
    rows = (
        ds.select_columns(["_hoodie_commit_time", "ts", "uuid", "fare"])
        .sort("fare")
        .take_all()
    )
    assert rows == [
        {
            "_hoodie_commit_time": "20240402123035233",
            "ts": 1695115999911,
            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
            "fare": 17.85,
        },
        {
            "_hoodie_commit_time": "20240402123035233",
            "ts": 1695159649087,
            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
            "fare": 19.1,
        },
        {
            "_hoodie_commit_time": "20240402123035233",
            "ts": 1695091554788,
            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
            "fare": 27.7,
        },
        {
            "_hoodie_commit_time": "20240402123035233",
            "ts": 1695516137016,
            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
            "fare": 34.15,
        },
        {
            "_hoodie_commit_time": "20240402144910683",
            "ts": 1695046462179,
            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
            "fare": 339.0,
        },
    ]


if __name__ == "__main__":
    import sys

    sys.exit(pytest.main(["-v", __file__]))