From 86dc6abf1b58ba8119f40313a2768054bbb4a369 Mon Sep 17 00:00:00 2001
From: Sanjay Kumar Sakamuri Kamalakar <sksanjaykumar010307@gmail.com>
Date: Wed, 13 Aug 2025 19:38:01 +0530
Subject: [PATCH 1/3] docs: add column-overwrite example to about_map_batch.mdx

---
 docs/source/about_map_batch.mdx | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/source/about_map_batch.mdx b/docs/source/about_map_batch.mdx
index 4ebbdf9acaf..d92999bbd07 100644
--- a/docs/source/about_map_batch.mdx
+++ b/docs/source/about_map_batch.mdx
@@ -38,3 +38,22 @@ To make it valid, you have to drop one of the columns:
 >>> len(dataset_with_duplicates)
 6
 ```
+Alternatively, you can overwrite the existing column to achieve the same result.
+For example, here’s how to duplicate every row in the dataset by overwriting column `"a"`:
+
+```py
+>>> from datasets import Dataset
+>>> dataset = Dataset.from_dict({"a": [0, 1, 2]})
+# Overwrites the existing "a" column with duplicated values
+>>> duplicated_dataset = dataset.map(
+...     lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]},
+...     batched=True
+... )
+>>> duplicated_dataset
+Dataset({
+    features: ['a'],
+    num_rows: 6
+})
+>>> duplicated_dataset["a"]
+[0, 0, 1, 1, 2, 2]
+```

From 7018bf9a00146542f0921ea92221f37c2c0aaccd Mon Sep 17 00:00:00 2001
From: Sanjay Kumar Sakamuri Kamalakar <sksanjaykumar010307@gmail.com>
Date: Wed, 13 Aug 2025 19:46:26 +0530
Subject: [PATCH 2/3] docs: lowercase comment in about_map_batch.mdx overwrite example

---
 docs/source/about_map_batch.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/about_map_batch.mdx b/docs/source/about_map_batch.mdx
index d92999bbd07..6c33fed593f 100644
--- a/docs/source/about_map_batch.mdx
+++ b/docs/source/about_map_batch.mdx
@@ -44,7 +44,7 @@ For example, here’s how to duplicate every row in the dataset by overwriting c
 ```py
 >>> from datasets import Dataset
 >>> dataset = Dataset.from_dict({"a": [0, 1, 2]})
-# Overwrites the existing "a" column with duplicated values
+# overwrites the existing "a" column with duplicated values
 >>> duplicated_dataset = dataset.map(
 ...     lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]},
 ...     batched=True

From 0578f99bdaa3d9f80277a233ad12d7ecc0857028 Mon Sep 17 00:00:00 2001
From: Sanjay Kumar Sakamuri Kamalakar <sksanjaykumar010307@gmail.com>
Date: Sat, 13 Sep 2025 22:39:12 +0530
Subject: [PATCH 3/3] fix: correct float feature generation in
 generate_examples

---
 benchmarks/utils.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/benchmarks/utils.py b/benchmarks/utils.py
index feb13d9c8fa..b973611288c 100644
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -1,7 +1,5 @@
 import timeit
-
 import numpy as np
-
 import datasets
 from datasets.arrow_writer import ArrowWriter
 from datasets.features.features import _ArrayXD
@@ -15,28 +13,42 @@ def wrapper(*args, **kwargs):
         return delta
 
     wrapper.__name__ = func.__name__
-
     return wrapper
 
 
 def generate_examples(features: dict, num_examples=100, seq_shapes=None):
     dummy_data = []
     seq_shapes = seq_shapes or {}
+
     for i in range(num_examples):
         example = {}
         for col_id, (k, v) in enumerate(features.items()):
+
             if isinstance(v, _ArrayXD):
                 data = np.random.rand(*v.shape).astype(v.dtype)
+
             elif isinstance(v, datasets.Value):
                 if v.dtype == "string":
                     data = "The small grey turtle was surprisingly fast when challenged."
+                elif "int" in v.dtype:
+                    data = np.random.randint(0, 10, size=1).astype(v.dtype).item()
+                elif "float" in v.dtype:
+                    data = np.random.rand(1).astype(v.dtype).item()
                 else:
-                    data = np.random.randint(10, size=1).astype(v.dtype).item()
+                    raise TypeError(f"Unsupported dtype: {v.dtype}")
+
             elif isinstance(v, datasets.Sequence):
-                while isinstance(v, datasets.Sequence):
-                    v = v.feature
-                shape = seq_shapes[k]
-                data = np.random.rand(*shape).astype(v.dtype)
+                feature = v
+                while isinstance(feature, datasets.Sequence):
+                    feature = feature.feature
+                shape = seq_shapes.get(k)
+                if shape is None:
+                    raise ValueError(f"Shape for sequence feature '{k}' not provided in seq_shapes.")
+                data = np.random.rand(*shape).astype(feature.dtype)
+
+            else:
+                raise TypeError(f"Unsupported feature type for key '{k}': {type(v)}")
+
             example[k] = data
 
         dummy_data.append((i, example))
@@ -59,6 +71,9 @@ def generate_example_dataset(dataset_path, features, num_examples=100, seq_shape
             f"Error writing the dataset, wrote {num_final_examples} examples but should have written {num_examples}."
         )
 
-    dataset = datasets.Dataset.from_file(filename=dataset_path, info=datasets.DatasetInfo(features=features))
+    dataset = datasets.Dataset.from_file(
+        filename=dataset_path,
+        info=datasets.DatasetInfo(features=features)
+    )
 
     return dataset