From 545d1b35a449fb83715108e69731f4897360ffcf Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Fri, 30 Aug 2024 17:09:04 -0700
Subject: [PATCH] [data] add a comment explaining the bundling behavior for
 map_batches with default batch_size (#47433)

When batch_size is not set, input blocks will not be bundled up. Add a
comment explaining this.

See https://github.com/ray-project/ray/pull/29971 and
https://github.com/ray-project/ray/pull/47363#issuecomment-2322305566

Signed-off-by: Hao Chen
Signed-off-by: ujjawal-khare
---
 python/ray/data/dataset.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index eafaeb3a60fa..bb8ae674cbe6 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -549,6 +549,13 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             specified ``batch_size`` if ``batch_size`` doesn't evenly divide the
             block(s) sent to a given map task.
 
+            If ``batch_size`` is set and each input block is smaller than the
+            ``batch_size``, Ray Data bundles multiple blocks together as the
+            input for one task, until their total size is equal to or greater
+            than the given ``batch_size``.
+            If ``batch_size`` is not set, no bundling is performed and each task
+            receives exactly one input block.
+
         .. seealso::
 
             :meth:`~Dataset.iter_batches`
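
The following is a minimal sketch of how the documented behavior surfaces in user code. It assumes a local Ray 2.x installation; the dataset shape, the `count_rows` helper, and the `override_num_blocks` argument are illustrative choices, not part of the patch.

```python
import numpy as np
import ray

# Build a dataset split into 100 small blocks of 10 rows each.
# (override_num_blocks is the Ray 2.x argument; older releases used
# `parallelism` instead.)
ds = ray.data.range(1000, override_num_blocks=100)


def count_rows(batch: dict) -> dict:
    # With the default "numpy" batch format, `batch` maps column names to
    # NumPy arrays; report how many rows this batch contains.
    return {"rows_in_batch": np.array([len(batch["id"])])}


# batch_size not set: no bundling, so each map task receives exactly one
# block and every batch the UDF sees has at most 10 rows.
print(ds.map_batches(count_rows).take(3))

# batch_size=100: small blocks are bundled into one task input until the
# total reaches at least 100 rows, so the UDF should see ~100-row batches.
print(ds.map_batches(count_rows, batch_size=100).take(3))
```

Comparing the printed `rows_in_batch` values for the two calls is a quick way to observe the bundling difference described by the new docstring text.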