|
50 | 50 | from ray.data._internal.datasource.mcap_datasource import MCAPDatasource, TimeRange |
51 | 51 | from ray.data._internal.datasource.mongo_datasource import MongoDatasource |
52 | 52 | from ray.data._internal.datasource.numpy_datasource import NumpyDatasource |
53 | | -from ray.data._internal.datasource.parquet_bulk_datasource import ParquetBulkDatasource |
54 | 53 | from ray.data._internal.datasource.parquet_datasource import ParquetDatasource |
55 | 54 | from ray.data._internal.datasource.range_datasource import RangeDatasource |
56 | 55 | from ray.data._internal.datasource.sql_datasource import SQLDatasource |
|
100 | 99 | ) |
101 | 100 | from ray.data.datasource.file_meta_provider import ( |
102 | 101 | DefaultFileMetadataProvider, |
103 | | - FastFileMetadataProvider, |
104 | 102 | FileMetadataProvider, |
105 | 103 | ) |
106 | 104 | from ray.data.datasource.partitioning import Partitioning |
107 | 105 | from ray.types import ObjectRef |
108 | | -from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI |
| 106 | +from ray.util.annotations import DeveloperAPI, PublicAPI |
109 | 107 | from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy |
110 | 108 |
|
111 | 109 | if TYPE_CHECKING: |
@@ -1224,155 +1222,6 @@ class string |
1224 | 1222 | ) |
1225 | 1223 |
|
1226 | 1224 |
|
1227 | | -@Deprecated |
1228 | | -def read_parquet_bulk( |
1229 | | - paths: Union[str, List[str]], |
1230 | | - *, |
1231 | | - filesystem: Optional["pyarrow.fs.FileSystem"] = None, |
1232 | | - columns: Optional[List[str]] = None, |
1233 | | - parallelism: int = -1, |
1234 | | - num_cpus: Optional[float] = None, |
1235 | | - num_gpus: Optional[float] = None, |
1236 | | - memory: Optional[float] = None, |
1237 | | - ray_remote_args: Dict[str, Any] = None, |
1238 | | - arrow_open_file_args: Optional[Dict[str, Any]] = None, |
1239 | | - tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None, |
1240 | | - meta_provider: Optional[BaseFileMetadataProvider] = None, |
1241 | | - partition_filter: Optional[PathPartitionFilter] = None, |
1242 | | - shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None, |
1243 | | - include_paths: bool = False, |
1244 | | - file_extensions: Optional[List[str]] = ParquetBulkDatasource._FILE_EXTENSIONS, |
1245 | | - concurrency: Optional[int] = None, |
1246 | | - override_num_blocks: Optional[int] = None, |
1247 | | - **arrow_parquet_args, |
1248 | | -) -> Dataset: |
1249 | | - """Create :class:`~ray.data.Dataset` from parquet files without reading metadata. |
1250 | | -
|
1251 | | - Use :meth:`~ray.data.read_parquet` for most cases. |
1252 | | -
|
1253 | | - Use :meth:`~ray.data.read_parquet_bulk` if all the provided paths point to files |
1254 | | - and metadata fetching using :meth:`~ray.data.read_parquet` takes too long or the |
1255 | | - parquet files do not all have a unified schema. |
1256 | | -
|
1257 | | - Performance slowdowns are possible when using this method with parquet files that |
1258 | | - are very large. |
1259 | | -
|
1260 | | - .. warning:: |
1261 | | -
|
1262 | | - Only provide file paths as input (i.e., no directory paths). An |
1263 | | - OSError is raised if one or more paths point to directories. If your |
1264 | | - use-case requires directory paths, use :meth:`~ray.data.read_parquet` |
1265 | | - instead. |
1266 | | -
|
1267 | | - Examples: |
1268 | | - Read multiple local files. You should always provide only input file paths |
1269 | | - (i.e. no directory paths) when known to minimize read latency. |
1270 | | -
|
1271 | | - >>> ray.data.read_parquet_bulk( # doctest: +SKIP |
1272 | | - ... ["/path/to/file1", "/path/to/file2"]) |
1273 | | -
|
1274 | | - Args: |
1275 | | - paths: A single file path or a list of file paths. |
1276 | | - filesystem: The PyArrow filesystem |
1277 | | - implementation to read from. These filesystems are |
1278 | | - specified in the |
1279 | | - `PyArrow docs <https://arrow.apache.org/docs/python/api/\ |
1280 | | - filesystems.html#filesystem-implementations>`_. |
1281 | | - Specify this parameter if you need to provide specific configurations to |
1282 | | - the filesystem. By default, the filesystem is automatically selected based |
1283 | | - on the scheme of the paths. For example, if the path begins with ``s3://``, |
1284 | | - the `S3FileSystem` is used. |
1285 | | - columns: A list of column names to read. Only the |
1286 | | - specified columns are read during the file scan. |
1287 | | - parallelism: This argument is deprecated. Use ``override_num_blocks`` argument. |
1288 | | - num_cpus: The number of CPUs to reserve for each parallel read worker. |
1289 | | - num_gpus: The number of GPUs to reserve for each parallel read worker. For |
1290 | | - example, specify `num_gpus=1` to request 1 GPU for each parallel read |
1291 | | - worker. |
1292 | | - memory: The heap memory in bytes to reserve for each parallel read worker. |
1293 | | - ray_remote_args: kwargs passed to :func:`ray.remote` in the read tasks. |
1294 | | - arrow_open_file_args: kwargs passed to |
1295 | | - `pyarrow.fs.FileSystem.open_input_file <https://arrow.apache.org/docs/\ |
1296 | | - python/generated/pyarrow.fs.FileSystem.html\ |
1297 | | - #pyarrow.fs.FileSystem.open_input_file>`_. |
1298 | | - when opening input files to read. |
1299 | | - tensor_column_schema: A dict of column name to PyArrow dtype and shape |
1300 | | - mappings for converting a Parquet column containing serialized |
1301 | | - tensors (ndarrays) as their elements to PyArrow tensors. This function |
1302 | | - assumes that the tensors are serialized in the raw |
1303 | | - NumPy array format in C-contiguous order (e.g. via |
1304 | | - `arr.tobytes()`). |
1305 | | - meta_provider: [Deprecated] A :ref:`file metadata provider <metadata_provider>`. |
1306 | | - Custom metadata providers may be able to resolve file metadata more quickly |
1307 | | - and/or accurately. In most cases, you do not need to set this. If ``None``, |
1308 | | - this function uses a system-chosen implementation. |
1309 | | - partition_filter: A |
1310 | | - :class:`~ray.data.datasource.partitioning.PathPartitionFilter`. Use |
1311 | | - with a custom callback to read only selected partitions of a dataset. |
1312 | | - By default, this filters out any file paths whose file extension does not |
1313 | | - match "*.parquet*". |
1314 | | - shuffle: If setting to "files", randomly shuffle input files order before read. |
1315 | | - If setting to :class:`~ray.data.FileShuffleConfig`, you can pass a seed to |
1316 | | - shuffle the input files. Defaults to not shuffle with ``None``. |
1317 | | - arrow_parquet_args: Other parquet read options to pass to PyArrow. For the full |
1318 | | - set of arguments, see |
1319 | | - the `PyArrow API <https://arrow.apache.org/docs/python/generated/\ |
1320 | | - pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment>`_ |
1321 | | - include_paths: If ``True``, include the path to each file. File paths are |
1322 | | - stored in the ``'path'`` column. |
1323 | | - file_extensions: A list of file extensions to filter files by. |
1324 | | - concurrency: The maximum number of Ray tasks to run concurrently. Set this |
1325 | | - to control number of tasks to run concurrently. This doesn't change the |
1326 | | - total number of tasks run or the total number of output blocks. By default, |
1327 | | - concurrency is dynamically decided based on the available resources. |
1328 | | - override_num_blocks: Override the number of output blocks from all read tasks. |
1329 | | - By default, the number of output blocks is dynamically decided based on |
1330 | | - input data size and available resources. You shouldn't manually set this |
1331 | | - value in most cases. |
1332 | | -
|
1333 | | - Returns: |
1334 | | - :class:`~ray.data.Dataset` producing records read from the specified paths. |
1335 | | - """ |
1336 | | - _emit_meta_provider_deprecation_warning(meta_provider) |
1337 | | - |
1338 | | - warnings.warn( |
1339 | | - "`read_parquet_bulk` is deprecated and will be removed after May 2025. Use " |
1340 | | - "`read_parquet` instead.", |
1341 | | - DeprecationWarning, |
1342 | | - ) |
1343 | | - |
1344 | | - if meta_provider is None: |
1345 | | - meta_provider = FastFileMetadataProvider() |
1346 | | - read_table_args = _resolve_parquet_args( |
1347 | | - tensor_column_schema, |
1348 | | - **arrow_parquet_args, |
1349 | | - ) |
1350 | | - if columns is not None: |
1351 | | - read_table_args["columns"] = columns |
1352 | | - |
1353 | | - datasource = ParquetBulkDatasource( |
1354 | | - paths, |
1355 | | - read_table_args=read_table_args, |
1356 | | - filesystem=filesystem, |
1357 | | - open_stream_args=arrow_open_file_args, |
1358 | | - meta_provider=meta_provider, |
1359 | | - partition_filter=partition_filter, |
1360 | | - shuffle=shuffle, |
1361 | | - include_paths=include_paths, |
1362 | | - file_extensions=file_extensions, |
1363 | | - ) |
1364 | | - return read_datasource( |
1365 | | - datasource, |
1366 | | - num_cpus=num_cpus, |
1367 | | - num_gpus=num_gpus, |
1368 | | - memory=memory, |
1369 | | - parallelism=parallelism, |
1370 | | - ray_remote_args=ray_remote_args, |
1371 | | - concurrency=concurrency, |
1372 | | - override_num_blocks=override_num_blocks, |
1373 | | - ) |
1374 | | - |
1375 | | - |
1376 | 1225 | @PublicAPI |
1377 | 1226 | def read_json( |
1378 | 1227 | paths: Union[str, List[str]], |
|
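For reference (not part of this diff): the removed function's own deprecation warning directs callers to `read_parquet`. Below is a minimal migration sketch, assuming the public `ray.data.read_parquet` API; the file paths and column names are placeholders.

import ray

# Before (removed by this change):
# ds = ray.data.read_parquet_bulk(["/path/to/file1", "/path/to/file2"])

# After: read_parquet covers the same use case and also accepts directory paths.
ds = ray.data.read_parquet(
    ["/path/to/file1", "/path/to/file2"],
    columns=["col_a", "col_b"],  # placeholder column names; optional projection
)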