From 04fa22c957d16cdf98d09909c567d0bfaa7d9cca Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Sun, 9 Nov 2025 21:40:06 -0800
Subject: [PATCH 01/10] Improve Hugging Face reading instructions

Signed-off-by: Robert Nishihara
---
 doc/source/data/loading-data.rst | 74 ++++++++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 19 deletions(-)

diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst
index 72ce1c4f7106..78c88a082ffa 100644
--- a/doc/source/data/loading-data.rst
+++ b/doc/source/data/loading-data.rst
@@ -663,36 +663,72 @@ Ray Data interoperates with HuggingFace, PyTorch, and TensorFlow datasets.

    .. tab-item:: HuggingFace

-        To convert a HuggingFace Dataset to a Ray Datasets, call
-        :func:`~ray.data.from_huggingface`. This function accesses the underlying Arrow
-        table and converts it to a Dataset directly.
+        To read datasets from the Hugging Face Hub, use :func:`~ray.data.read_parquet` (or other
+        read functions) with the ``HfFileSystem`` filesystem. This approach provides better
+        performance and scalability than loading datasets into memory first.

-        .. warning::
-            :class:`~ray.data.from_huggingface` only supports parallel reads in certain
-            instances, namely for untransformed public HuggingFace Datasets. For those datasets,
-            Ray Data uses `hosted parquet files `_
-            to perform a distributed read; otherwise, Ray Data uses a single node read.
-            This behavior shouldn't be an issue with in-memory HuggingFace Datasets, but may cause a failure with
-            large memory-mapped HuggingFace Datasets. Additionally, HuggingFace `DatasetDict `_ and
-            `IterableDatasetDict `_
-            objects aren't supported.
+        First, install the required dependencies:
+
+        .. code-block:: console
+
+            pip install datasets huggingface_hub
+
+        Then, authenticate using your Hugging Face token:

-        .. This snippet below is skipped because of https://github.com/ray-project/ray/issues/54837.
+        .. code-block:: console
+
+            export HF_TOKEN=<your_token>
+
+        For most Hugging Face datasets, the data is stored in Parquet files. You can directly
+        read from the dataset path:

        .. testcode::
            :skipif: True

-            import ray.data
-            from datasets import load_dataset
+            import os
+            import ray
+            from huggingface_hub import HfFileSystem

-            hf_ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-            ray_ds = ray.data.from_huggingface(hf_ds["train"])
-            ray_ds.take(2)
+            ds = ray.data.read_parquet(
+                "hf://datasets/wikimedia/wikipedia",
+                file_extensions=["parquet"],
+                filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]),
+            )
+
+            print(f"Dataset count: {ds.count()}")
+            print(ds.schema())

        .. testoutput::
            :options: +MOCK

-            [{'text': ''}, {'text': ' = Valkyria Chronicles III = \n'}]
+            Dataset count: 61614907
+            Column  Type
+            ------  ----
+            id      string
+            url     string
+            title   string
+            text    string
+
+        .. tip::
+
+            For datasets that aren't in Parquet format, use the appropriate read function:
+            :func:`~ray.data.read_json` for JSON files, or :func:`~ray.data.read_binary_files`
+            for binary files like audio archives.
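+
+        For example, you can read a JSON-based dataset with :func:`~ray.data.read_json`.
+        The following sketch uses a hypothetical dataset path; substitute the repository
+        you actually want to read:
+
+        .. testcode::
+            :skipif: True
+
+            import os
+            import ray
+            from huggingface_hub import HfFileSystem
+
+            # "some-org/some-json-dataset" is a placeholder, not a real repository.
+            ds = ray.data.read_json(
+                "hf://datasets/some-org/some-json-dataset",
+                file_extensions=["json", "jsonl"],
+                filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]),
+            )

    .. tab-item:: PyTorch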
tab-item:: PyTorch From 69fe7d71fb1ae95a1108603fd8f8f8a0f3810d5e Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Sun, 9 Nov 2025 21:53:27 -0800 Subject: [PATCH 02/10] updates Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 126 +++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 39 deletions(-) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 78c88a082ffa..819d8146faa3 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -652,65 +652,113 @@ Ray Data interoperates with distributed data processing frameworks like `Daft - Then, authenticate using your Hugging Face token: +For most Hugging Face datasets, the data is stored in Parquet files. You can directly +read from the dataset path: - .. code-block:: console +.. testcode:: + :skipif: True - export HF_TOKEN= + import os + import ray + from huggingface_hub import HfFileSystem - For most Hugging Face datasets, the data is stored in Parquet files. You can directly - read from the dataset path: + ds = ray.data.read_parquet( + "hf://datasets/wikimedia/wikipedia", + file_extensions=["parquet"], + filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]), + ) - .. testcode:: - :skipif: True + print(f"Dataset count: {ds.count()}") + print(ds.schema()) - import os - import ray - from huggingface_hub import HfFileSystem +.. testoutput:: + :options: +MOCK - ds = ray.data.read_parquet( - "hf://datasets/wikimedia/wikipedia", - file_extensions=["parquet"], - filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]), - ) + Dataset count: 61614907 + Column Type + ------ ---- + id string + url string + title string + text string - print(f"Dataset count: {ds.count()}") - print(ds.schema()) +If you need to filter by split (train, test, validation, etc.) or parse filenames, +you can use the ``datasets`` library to discover files: - .. testoutput:: - :options: +MOCK +.. testcode:: + :skipif: True - Dataset count: 61614907 - Column Type - ------ ---- - id string - url string - title string - text string + import os + import ray + import datasets + from huggingface_hub import HfFileSystem + + # Specify the dataset and the split name. + dataset_name = "wikimedia/wikipedia" + split = "train" + + # Fetch the dataset files. + base_path = f"hf://datasets/{dataset_name}" + patterns = datasets.data_files.get_data_patterns(base_path) + data_files_with_splits = datasets.data_files.DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=datasets.load.ALL_ALLOWED_EXTENSIONS, + ) + data_files = data_files_with_splits["train"] - .. tip:: + # Read those files into Ray Data. + ds = ray.data.read_parquet( + data_files, + file_extensions=["parquet"], + filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]), + ) + + print(f"Dataset count: {ds.count()}") + +.. testoutput:: + :options: +MOCK + + Dataset count: 61614907 +.. tip:: + + For datasets that aren't in Parquet format, use the appropriate read function: + :func:`~ray.data.read_json` for JSON files, or :func:`~ray.data.read_binary_files` + for binary files like audio archives. + +For a complete example script that loads many different Hugging Face datasets, see +:download:`load_data_from_hf.py `. + +.. _loading_datasets_from_ml_libraries: + +Loading data from ML libraries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ray Data interoperates with PyTorch and TensorFlow datasets. + +.. tab-set:: .. 

From 0084637e312da9953c62c355c452667c8d20e282 Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Sun, 9 Nov 2025 21:54:35 -0800
Subject: [PATCH 03/10] Minor

Signed-off-by: Robert Nishihara
---
 doc/source/data/loading-data.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst
index 819d8146faa3..037eb96bd53d 100644
--- a/doc/source/data/loading-data.rst
+++ b/doc/source/data/loading-data.rst
@@ -754,7 +754,7 @@ For a complete example script that loads many different Hugging Face datasets, s
 .. _loading_datasets_from_ml_libraries:

 Loading data from ML libraries
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Ray Data interoperates with PyTorch and TensorFlow datasets.

From ff132f566259e222228a7d8797c904385e4d4a61 Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Sun, 9 Nov 2025 21:56:17 -0800
Subject: [PATCH 04/10] remove excess

Signed-off-by: Robert Nishihara
---
 doc/source/data/loading-data.rst | 47 --------------------------------
 1 file changed, 47 deletions(-)

diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst
index 037eb96bd53d..a4f93ef49baa 100644
--- a/doc/source/data/loading-data.rst
+++ b/doc/source/data/loading-data.rst
@@ -703,53 +703,6 @@ read from the dataset path:
     title   string
     text    string

-If you need to filter by split (train, test, validation, etc.) or parse filenames,
-you can use the ``datasets`` library to discover files:
-
-.. testcode::
-    :skipif: True
-
-    import os
-    import ray
-    import datasets
-    from huggingface_hub import HfFileSystem
-
-    # Specify the dataset and the split name.
-    dataset_name = "wikimedia/wikipedia"
-    split = "train"
-
-    # Fetch the dataset files.
-    base_path = f"hf://datasets/{dataset_name}"
-    patterns = datasets.data_files.get_data_patterns(base_path)
-    data_files_with_splits = datasets.data_files.DataFilesDict.from_patterns(
-        patterns,
-        base_path=base_path,
-        allowed_extensions=datasets.load.ALL_ALLOWED_EXTENSIONS,
-    )
-    data_files = data_files_with_splits[split]
-
-    # Read those files into Ray Data.
-    ds = ray.data.read_parquet(
-        data_files,
-        file_extensions=["parquet"],
-        filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]),
-    )
-
-    print(f"Dataset count: {ds.count()}")
-
-.. testoutput::
-    :options: +MOCK
-
-    Dataset count: 61614907
-
-.. tip::
-
-    For datasets that aren't in Parquet format, use the appropriate read function:
-    :func:`~ray.data.read_json` for JSON files, or :func:`~ray.data.read_binary_files`
-    for binary files like audio archives.
-
-For a complete example script that loads many different Hugging Face datasets, see
-:download:`load_data_from_hf.py `.

 .. _loading_datasets_from_ml_libraries:

From 708ea9661f12d73a2a7de69318658b962c6ffbfc Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Fri, 14 Nov 2025 00:43:13 -0800
Subject: [PATCH 05/10] Update

Signed-off-by: Robert Nishihara
---
 doc/source/data/loading-data.rst | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst
index a4f93ef49baa..e8be49a3f09f 100644
--- a/doc/source/data/loading-data.rst
+++ b/doc/source/data/loading-data.rst
@@ -667,7 +667,26 @@ First, install the required dependencies:

     pip install huggingface_hub

-Then, authenticate using your Hugging Face token:
+Set your Hugging Face token to authenticate. While public datasets can be read without
+a token, you will suffer aggressive rate limiting without a token.
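+
+For example, anonymous access looks like the following sketch. The read works for
+public datasets, but it's subject to the stricter anonymous rate limits:
+
+.. testcode::
+    :skipif: True
+
+    import ray
+    from huggingface_hub import HfFileSystem
+
+    # No token is passed, so requests are anonymous and rate limited more aggressively.
+    ds = ray.data.read_parquet(
+        "hf://datasets/wikimedia/wikipedia",
+        file_extensions=["parquet"],
+        filesystem=HfFileSystem(),
+    )
+
+To authenticate, set your token:

..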
code-block:: console From 552584c08d1563028e93c0c958d71490c73f8c9d Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 14 Nov 2025 14:36:36 -0800 Subject: [PATCH 06/10] Update Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index e8be49a3f09f..99b4662084ff 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -668,7 +668,8 @@ First, install the required dependencies: pip install huggingface_hub Set your Hugging Face token to authenticate. While public datasets can be read without -a token, you will suffer aggressive rate limiting without a token. +a token, Hugging Face rate limits are more aggressive without a token. To read Hugging +Face datasets without a token, simply set the filesystem arguemnt to ``HfFileSystem()``. .. code-block:: console From 8f45067961465ebdd43f2d89cc3394a412b7d1f9 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 14 Nov 2025 14:39:57 -0800 Subject: [PATCH 07/10] Fix Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 99b4662084ff..f7567b1edc69 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -695,7 +695,6 @@ read from the dataset path: print(ds.schema()) .. testoutput:: - :options: +MOCK Dataset count: 61614907 Column Type From 4ee1b5286d876299719713253cb9aa332280f5d8 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Mon, 17 Nov 2025 22:41:08 -0800 Subject: [PATCH 08/10] fix Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index f7567b1edc69..a5c660f912d1 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -669,7 +669,7 @@ First, install the required dependencies: Set your Hugging Face token to authenticate. While public datasets can be read without a token, Hugging Face rate limits are more aggressive without a token. To read Hugging -Face datasets without a token, simply set the filesystem arguemnt to ``HfFileSystem()``. +Face datasets without a token, simply set the filesystem argument to ``HfFileSystem()``. .. code-block:: console From af8eec897aa15dc4cb9562698ebc26d5df8c0b90 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Tue, 18 Nov 2025 21:51:24 -0800 Subject: [PATCH 09/10] Removing code for testing purposes. Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 53 -------------------------------- 1 file changed, 53 deletions(-) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index a5c660f912d1..7b3d94b416b8 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -652,59 +652,6 @@ Ray Data interoperates with distributed data processing frameworks like `Daft - -For most Hugging Face datasets, the data is stored in Parquet files. You can directly -read from the dataset path: - -.. testcode:: - :skipif: True - - import os - import ray - from huggingface_hub import HfFileSystem - - ds = ray.data.read_parquet( - "hf://datasets/wikimedia/wikipedia", - file_extensions=["parquet"], - filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]), - ) - - print(f"Dataset count: {ds.count()}") - print(ds.schema()) - -.. 
testoutput:: - - Dataset count: 61614907 - Column Type - ------ ---- - id string - url string - title string - text string - - .. _loading_datasets_from_ml_libraries: Loading data from ML libraries From d0cf6b5e19f95282c37d4c672620da6169613ab4 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Wed, 19 Nov 2025 12:37:25 -0800 Subject: [PATCH 10/10] add back part Signed-off-by: Robert Nishihara --- doc/source/data/loading-data.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 7b3d94b416b8..df80a4db68f1 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -652,6 +652,17 @@ Ray Data interoperates with distributed data processing frameworks like `Daft
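
The email is truncated here, so the body of this final hunk is missing. Judging from
the subject line ("add back part") and the block that patch 09 removed, the restored
content plausibly covers part of the following sketch, reconstructed from patch 09's
removal (the hunk header reports 11 insertions, so the actual patch likely restores
only a subset of it):

+
+For most Hugging Face datasets, the data is stored in Parquet files. You can directly
+read from the dataset path:
+
+.. testcode::
+    :skipif: True
+
+    import os
+    import ray
+    from huggingface_hub import HfFileSystem
+
+    ds = ray.data.read_parquet(
+        "hf://datasets/wikimedia/wikipedia",
+        file_extensions=["parquet"],
+        filesystem=HfFileSystem(token=os.environ["HF_TOKEN"]),
+    )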