From 545d1b35a449fb83715108e69731f4897360ffcf Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Fri, 30 Aug 2024 17:09:04 -0700
Subject: [PATCH] [data] add a comment explaining the bundling behavior for
 map_batches with default batch_size (#47433)

When batch_size is not set, input blocks will not be bundled up. Add a
comment explaining this.

See https://github.com/ray-project/ray/pull/29971 and
https://github.com/ray-project/ray/pull/47363#issuecomment-2322305566

Signed-off-by: Hao Chen
Signed-off-by: ujjawal-khare
---
 python/ray/data/dataset.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index eafaeb3a60fa..bb8ae674cbe6 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -549,6 +549,13 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             specified ``batch_size`` if ``batch_size`` doesn't evenly divide the
             block(s) sent to a given map task.
 
+            If ``batch_size`` is set and each input block is smaller than the
+            ``batch_size``, Ray Data bundles multiple blocks together as the
+            input for one task, until their total size is equal to or greater
+            than the given ``batch_size``.
+            If ``batch_size`` is not set, no bundling is performed and each task
+            receives exactly one input block.
+
         .. seealso::
 
             :meth:`~Dataset.iter_batches`
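
The following is a minimal sketch of how the documented behavior surfaces in user code. It assumes a local Ray 2.x installation; the dataset shape, the `count_rows` helper, and the `override_num_blocks` argument are illustrative choices, not part of the patch.

```python
import numpy as np
import ray

# Build a dataset split into 100 small blocks of 10 rows each.
# (override_num_blocks is the Ray 2.x argument; older releases used
# `parallelism` instead.)
ds = ray.data.range(1000, override_num_blocks=100)


def count_rows(batch: dict) -> dict:
    # With the default "numpy" batch format, `batch` maps column names to
    # NumPy arrays; report how many rows this batch contains.
    return {"rows_in_batch": np.array([len(batch["id"])])}


# batch_size not set: no bundling, so each map task receives exactly one
# block and every batch the UDF sees has at most 10 rows.
print(ds.map_batches(count_rows).take(3))

# batch_size=100: small blocks are bundled into one task input until the
# total reaches at least 100 rows, so the UDF should see ~100-row batches.
print(ds.map_batches(count_rows, batch_size=100).take(3))
```

Comparing the printed `rows_in_batch` values for the two calls is a quick way to observe the bundling difference described by the new docstring text.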