From 86dc6abf1b58ba8119f40313a2768054bbb4a369 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Sakamuri Kamalakar Date: Wed, 13 Aug 2025 19:38:01 +0530 Subject: [PATCH 1/3] Update about_map_batch.mdx --- docs/source/about_map_batch.mdx | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/about_map_batch.mdx b/docs/source/about_map_batch.mdx index 4ebbdf9acaf..d92999bbd07 100644 --- a/docs/source/about_map_batch.mdx +++ b/docs/source/about_map_batch.mdx @@ -38,3 +38,22 @@ To make it valid, you have to drop one of the columns: >>> len(dataset_with_duplicates) 6 ``` +Alternatively, you can overwrite the existing column to achieve the same result. +For example, here’s how to duplicate every row in the dataset by overwriting column `"a"`: + +```py +>>> from datasets import Dataset +>>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) +# Overwrites the existing "a" column with duplicated values +>>> duplicated_dataset = dataset.map( +... lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]}, +... batched=True +... ) +>>> duplicated_dataset +Dataset({ + features: ['a'], + num_rows: 6 +}) +>>> duplicated_dataset["a"] +[0, 0, 1, 1, 2, 2] +``` From 7018bf9a00146542f0921ea92221f37c2c0aaccd Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Sakamuri Kamalakar Date: Wed, 13 Aug 2025 19:46:26 +0530 Subject: [PATCH 2/3] Update about_map_batch.mdx --- docs/source/about_map_batch.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/about_map_batch.mdx b/docs/source/about_map_batch.mdx index d92999bbd07..6c33fed593f 100644 --- a/docs/source/about_map_batch.mdx +++ b/docs/source/about_map_batch.mdx @@ -44,7 +44,7 @@ For example, here’s how to duplicate every row in the dataset by overwriting c ```py >>> from datasets import Dataset >>> dataset = Dataset.from_dict({"a": [0, 1, 2]}) -# Overwrites the existing "a" column with duplicated values +# overwrites the existing "a" column with duplicated values >>> duplicated_dataset = dataset.map( ... lambda batch: {"a": [x for x in batch["a"] for _ in range(2)]}, ... batched=True From 0578f99bdaa3d9f80277a233ad12d7ecc0857028 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Sakamuri Kamalakar Date: Sat, 13 Sep 2025 22:39:12 +0530 Subject: [PATCH 3/3] fix: correct float feature generation in generate_examples --- benchmarks/utils.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index feb13d9c8fa..b973611288c 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,7 +1,5 @@ import timeit - import numpy as np - import datasets from datasets.arrow_writer import ArrowWriter from datasets.features.features import _ArrayXD @@ -15,28 +13,42 @@ def wrapper(*args, **kwargs): return delta wrapper.__name__ = func.__name__ - return wrapper def generate_examples(features: dict, num_examples=100, seq_shapes=None): dummy_data = [] seq_shapes = seq_shapes or {} + for i in range(num_examples): example = {} for col_id, (k, v) in enumerate(features.items()): + if isinstance(v, _ArrayXD): data = np.random.rand(*v.shape).astype(v.dtype) + elif isinstance(v, datasets.Value): if v.dtype == "string": data = "The small grey turtle was surprisingly fast when challenged." + elif "int" in v.dtype: + data = np.random.randint(0, 10, size=1).astype(v.dtype).item() + elif "float" in v.dtype: + data = np.random.rand(1).astype(v.dtype).item() else: - data = np.random.randint(10, size=1).astype(v.dtype).item() + raise TypeError(f"Unsupported dtype: {v.dtype}") + elif isinstance(v, datasets.Sequence): - while isinstance(v, datasets.Sequence): - v = v.feature - shape = seq_shapes[k] - data = np.random.rand(*shape).astype(v.dtype) + feature = v + while isinstance(feature, datasets.Sequence): + feature = feature.feature + shape = seq_shapes.get(k) + if shape is None: + raise ValueError(f"Shape for sequence feature '{k}' not provided in seq_shapes.") + data = np.random.rand(*shape).astype(feature.dtype) + + else: + raise TypeError(f"Unsupported feature type for key '{k}': {type(v)}") + example[k] = data dummy_data.append((i, example)) @@ -59,6 +71,9 @@ def generate_example_dataset(dataset_path, features, num_examples=100, seq_shape f"Error writing the dataset, wrote {num_final_examples} examples but should have written {num_examples}." ) - dataset = datasets.Dataset.from_file(filename=dataset_path, info=datasets.DatasetInfo(features=features)) + dataset = datasets.Dataset.from_file( + filename=dataset_path, + info=datasets.DatasetInfo(features=features) + ) return dataset