Commit 031d2e1

data: remove deprecated read_parquet_bulk API
1 parent 562c233 commit 031d2e1

7 files changed: +2 additions, −334 deletions


ci/lint/pydoclint-baseline.txt

Lines changed: 0 additions & 1 deletion
@@ -1317,7 +1317,6 @@ python/ray/data/read_api.py
 DOC101: Function `read_bigquery`: Docstring contains fewer arguments than in function signature.
 DOC103: Function `read_bigquery`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [query: Optional[str]].
 DOC103: Function `read_parquet`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**arrow_parquet_args: ]. Arguments in the docstring but not in the function signature: [arrow_parquet_args: ].
-DOC103: Function `read_parquet_bulk`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**arrow_parquet_args: ]. Arguments in the docstring but not in the function signature: [arrow_parquet_args: ].
 DOC103: Function `read_json`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**arrow_json_args: ]. Arguments in the docstring but not in the function signature: [arrow_json_args: ].
 DOC103: Function `read_csv`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**arrow_csv_args: ]. Arguments in the docstring but not in the function signature: [arrow_csv_args: ].
 DOC101: Function `read_text`: Docstring contains fewer arguments than in function signature.

doc/source/data/api/input_output.rst

Lines changed: 0 additions & 1 deletion
@@ -32,7 +32,6 @@ Parquet
     :toctree: doc/

     read_parquet
-    read_parquet_bulk
     Dataset.write_parquet

 CSV

python/ray/data/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -66,7 +66,6 @@
     read_mongo,
     read_numpy,
     read_parquet,
-    read_parquet_bulk,
     read_snowflake,
     read_sql,
     read_text,
@@ -175,7 +174,6 @@
     "read_numpy",
     "read_mongo",
     "read_parquet",
-    "read_parquet_bulk",
     "read_snowflake",
     "read_sql",
     "read_tfrecords",

python/ray/data/_internal/datasource/parquet_bulk_datasource.py

Lines changed: 0 additions & 51 deletions
This file was deleted.

python/ray/data/_internal/datasource/parquet_datasource.py

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ def combine_predicates(
 class ParquetDatasource(Datasource):
     """Parquet datasource, for reading and writing Parquet files.

-    The primary difference from ParquetBulkDatasource is that this uses
+    The primary difference from read_parquet is that this uses
     PyArrow's `ParquetDataset` abstraction for dataset reads, and thus offers
     automatic Arrow dataset schema inference and row count collection at the
     cost of some potential performance and/or compatibility penalties.
python/ray/data/read_api.py

Lines changed: 1 addition & 152 deletions
@@ -50,7 +50,6 @@
 from ray.data._internal.datasource.mcap_datasource import MCAPDatasource, TimeRange
 from ray.data._internal.datasource.mongo_datasource import MongoDatasource
 from ray.data._internal.datasource.numpy_datasource import NumpyDatasource
-from ray.data._internal.datasource.parquet_bulk_datasource import ParquetBulkDatasource
 from ray.data._internal.datasource.parquet_datasource import ParquetDatasource
 from ray.data._internal.datasource.range_datasource import RangeDatasource
 from ray.data._internal.datasource.sql_datasource import SQLDatasource
@@ -100,12 +99,11 @@
 )
 from ray.data.datasource.file_meta_provider import (
     DefaultFileMetadataProvider,
-    FastFileMetadataProvider,
     FileMetadataProvider,
 )
 from ray.data.datasource.partitioning import Partitioning
 from ray.types import ObjectRef
-from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI
+from ray.util.annotations import DeveloperAPI, PublicAPI
 from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

 if TYPE_CHECKING:
@@ -1224,155 +1222,6 @@ class string
 )


-@Deprecated
-def read_parquet_bulk(
-    paths: Union[str, List[str]],
-    *,
-    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
-    columns: Optional[List[str]] = None,
-    parallelism: int = -1,
-    num_cpus: Optional[float] = None,
-    num_gpus: Optional[float] = None,
-    memory: Optional[float] = None,
-    ray_remote_args: Dict[str, Any] = None,
-    arrow_open_file_args: Optional[Dict[str, Any]] = None,
-    tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None,
-    meta_provider: Optional[BaseFileMetadataProvider] = None,
-    partition_filter: Optional[PathPartitionFilter] = None,
-    shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None,
-    include_paths: bool = False,
-    file_extensions: Optional[List[str]] = ParquetBulkDatasource._FILE_EXTENSIONS,
-    concurrency: Optional[int] = None,
-    override_num_blocks: Optional[int] = None,
-    **arrow_parquet_args,
-) -> Dataset:
-    """Create :class:`~ray.data.Dataset` from parquet files without reading metadata.
-
-    Use :meth:`~ray.data.read_parquet` for most cases.
-
-    Use :meth:`~ray.data.read_parquet_bulk` if all the provided paths point to files
-    and metadata fetching using :meth:`~ray.data.read_parquet` takes too long or the
-    parquet files do not all have a unified schema.
-
-    Performance slowdowns are possible when using this method with parquet files that
-    are very large.
-
-    .. warning::
-
-        Only provide file paths as input (i.e., no directory paths). An
-        OSError is raised if one or more paths point to directories. If your
-        use-case requires directory paths, use :meth:`~ray.data.read_parquet`
-        instead.
-
-    Examples:
-        Read multiple local files. You should always provide only input file paths
-        (i.e. no directory paths) when known to minimize read latency.
-
-        >>> ray.data.read_parquet_bulk( # doctest: +SKIP
-        ...     ["/path/to/file1", "/path/to/file2"])
-
-    Args:
-        paths: A single file path or a list of file paths.
-        filesystem: The PyArrow filesystem
-            implementation to read from. These filesystems are
-            specified in the
-            `PyArrow docs <https://arrow.apache.org/docs/python/api/\
-            filesystems.html#filesystem-implementations>`_.
-            Specify this parameter if you need to provide specific configurations to
-            the filesystem. By default, the filesystem is automatically selected based
-            on the scheme of the paths. For example, if the path begins with ``s3://``,
-            the `S3FileSystem` is used.
-        columns: A list of column names to read. Only the
-            specified columns are read during the file scan.
-        parallelism: This argument is deprecated. Use ``override_num_blocks`` argument.
-        num_cpus: The number of CPUs to reserve for each parallel read worker.
-        num_gpus: The number of GPUs to reserve for each parallel read worker. For
-            example, specify `num_gpus=1` to request 1 GPU for each parallel read
-            worker.
-        memory: The heap memory in bytes to reserve for each parallel read worker.
-        ray_remote_args: kwargs passed to :func:`ray.remote` in the read tasks.
-        arrow_open_file_args: kwargs passed to
-            `pyarrow.fs.FileSystem.open_input_file <https://arrow.apache.org/docs/\
-            python/generated/pyarrow.fs.FileSystem.html\
-            #pyarrow.fs.FileSystem.open_input_file>`_.
-            when opening input files to read.
-        tensor_column_schema: A dict of column name to PyArrow dtype and shape
-            mappings for converting a Parquet column containing serialized
-            tensors (ndarrays) as their elements to PyArrow tensors. This function
-            assumes that the tensors are serialized in the raw
-            NumPy array format in C-contiguous order (e.g. via
-            `arr.tobytes()`).
-        meta_provider: [Deprecated] A :ref:`file metadata provider <metadata_provider>`.
-            Custom metadata providers may be able to resolve file metadata more quickly
-            and/or accurately. In most cases, you do not need to set this. If ``None``,
-            this function uses a system-chosen implementation.
-        partition_filter: A
-            :class:`~ray.data.datasource.partitioning.PathPartitionFilter`. Use
-            with a custom callback to read only selected partitions of a dataset.
-            By default, this filters out any file paths whose file extension does not
-            match "*.parquet*".
-        shuffle: If setting to "files", randomly shuffle input files order before read.
-            If setting to :class:`~ray.data.FileShuffleConfig`, you can pass a seed to
-            shuffle the input files. Defaults to not shuffle with ``None``.
-        arrow_parquet_args: Other parquet read options to pass to PyArrow. For the full
-            set of arguments, see
-            the `PyArrow API <https://arrow.apache.org/docs/python/generated/\
-            pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment>`_
-        include_paths: If ``True``, include the path to each file. File paths are
-            stored in the ``'path'`` column.
-        file_extensions: A list of file extensions to filter files by.
-        concurrency: The maximum number of Ray tasks to run concurrently. Set this
-            to control number of tasks to run concurrently. This doesn't change the
-            total number of tasks run or the total number of output blocks. By default,
-            concurrency is dynamically decided based on the available resources.
-        override_num_blocks: Override the number of output blocks from all read tasks.
-            By default, the number of output blocks is dynamically decided based on
-            input data size and available resources. You shouldn't manually set this
-            value in most cases.
-
-    Returns:
-        :class:`~ray.data.Dataset` producing records read from the specified paths.
-    """
-    _emit_meta_provider_deprecation_warning(meta_provider)
-
-    warnings.warn(
-        "`read_parquet_bulk` is deprecated and will be removed after May 2025. Use "
-        "`read_parquet` instead.",
-        DeprecationWarning,
-    )
-
-    if meta_provider is None:
-        meta_provider = FastFileMetadataProvider()
-    read_table_args = _resolve_parquet_args(
-        tensor_column_schema,
-        **arrow_parquet_args,
-    )
-    if columns is not None:
-        read_table_args["columns"] = columns
-
-    datasource = ParquetBulkDatasource(
-        paths,
-        read_table_args=read_table_args,
-        filesystem=filesystem,
-        open_stream_args=arrow_open_file_args,
-        meta_provider=meta_provider,
-        partition_filter=partition_filter,
-        shuffle=shuffle,
-        include_paths=include_paths,
-        file_extensions=file_extensions,
-    )
-    return read_datasource(
-        datasource,
-        num_cpus=num_cpus,
-        num_gpus=num_gpus,
-        memory=memory,
-        parallelism=parallelism,
-        ray_remote_args=ray_remote_args,
-        concurrency=concurrency,
-        override_num_blocks=override_num_blocks,
-    )
-
-
 @PublicAPI
 def read_json(
     paths: Union[str, List[str]],