Feat: Add support of multiple datasets in config (#889)
Co-authored-by: Rafi Ayub <33648637+RdoubleA@users.noreply.github.com>
Co-authored-by: ebsmothers <ebs@meta.com>
3 people authored May 3, 2024
1 parent 9274c89 commit d36e818
Showing 18 changed files with 374 additions and 133 deletions.
1 change: 1 addition & 0 deletions docs/source/api_ref_datasets.rst
@@ -46,3 +46,4 @@ Class representations for the above dataset builders.

InstructDataset
ChatDataset
ConcatDataset
19 changes: 19 additions & 0 deletions docs/source/tutorials/datasets.rst
@@ -47,6 +47,25 @@ You could tweak :code:`max_seq_len` to achieve that directly from the config.
# Original is 512
max_seq_len: 256
It is also possible to train on multiple datasets by combining them into a single :class:`~torchtune.datasets.ConcatDataset`. For example:

.. code-block:: yaml

    dataset:
      - _component_: torchtune.datasets.instruct_dataset
        source: vicgalle/alpaca-gpt4
        template: AlpacaInstructTemplate
        split: train
        train_on_input: True
      - _component_: torchtune.datasets.instruct_dataset
        source: samsum
        template: SummarizeTemplate
        column_map: {"output": "summary"}
        split: train
        train_on_input: False

The preceding snippet demonstrates how you can configure each individual dataset's parameters, then combine them into a single concatenated dataset for training.
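
To make the mechanics concrete, the sketch below illustrates the index-mapping idea behind a concatenated dataset. This is a simplified illustration only, not the actual :class:`~torchtune.datasets.ConcatDataset` implementation, and the class name :code:`MinimalConcatDataset` is invented for this example.

.. code-block:: python

    # Simplified sketch of the index-mapping idea; not torchtune's actual
    # ConcatDataset implementation.
    from typing import Any, List, Sequence

    class MinimalConcatDataset:
        """Expose several map-style datasets as one contiguous dataset."""

        def __init__(self, datasets: List[Sequence[Any]]) -> None:
            self._datasets = datasets

        def __len__(self) -> int:
            # Total length is the sum of the constituent dataset lengths.
            return sum(len(d) for d in self._datasets)

        def __getitem__(self, index: int) -> Any:
            # Walk the datasets until the global index lands inside one of them.
            for dataset in self._datasets:
                if index < len(dataset):
                    return dataset[index]
                index -= len(dataset)
            raise IndexError("index out of range")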

Customizing instruct templates
------------------------------

18 changes: 12 additions & 6 deletions recipes/full_finetune_distributed.py
@@ -12,7 +12,7 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.distributed import init_process_group
@@ -27,7 +27,7 @@
from torch.utils.data import DataLoader, DistributedSampler

from torchtune import config, modules, utils

from torchtune.datasets import ConcatDataset
from torchtune.recipe_interfaces import FTRecipeInterface
from torchtune.utils.activations import apply_selective_activation_checkpointing

@@ -357,10 +357,16 @@ def _setup_data(
iterable datasets and streaming datasets are not supported.
"""
world_size, rank = utils.get_world_size_and_rank()
ds = config.instantiate(
cfg_dataset,
tokenizer=self._tokenizer,
)

if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds,
num_replicas=world_size,
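
Aside: the branch on ListConfig above (repeated in the other recipes below) works because OmegaConf parses a YAML list under the dataset key into a ListConfig, while a single mapping stays a DictConfig. A small standalone sketch, independent of torchtune, demonstrating that behavior:

from omegaconf import DictConfig, ListConfig, OmegaConf

# A single dataset entry parses to a DictConfig ...
single = OmegaConf.create(
    """
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  train_on_input: True
"""
)

# ... while a YAML list of dataset entries parses to a ListConfig,
# which is what triggers the ConcatDataset code path in the recipes.
multi = OmegaConf.create(
    """
dataset:
  - _component_: torchtune.datasets.instruct_dataset
    source: vicgalle/alpaca-gpt4
  - _component_: torchtune.datasets.instruct_dataset
    source: samsum
"""
)

assert isinstance(single.dataset, DictConfig)
assert isinstance(multi.dataset, ListConfig)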
17 changes: 11 additions & 6 deletions recipes/full_finetune_single_device.py
@@ -11,14 +11,14 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler

from torchtune import config, modules, utils

from torchtune.datasets import ConcatDataset
from torchtune.recipe_interfaces import FTRecipeInterface

from tqdm import tqdm
@@ -320,10 +320,15 @@ def _setup_data(
DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
iterable datasets and streaming datasets are not supported.
"""
ds = config.instantiate(
cfg_dataset,
tokenizer=self._tokenizer,
)
if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(config=cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds,
num_replicas=1,
14 changes: 12 additions & 2 deletions recipes/lora_dpo_distributed.py
@@ -12,7 +12,7 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.distributed import destroy_process_group, init_process_group
@@ -26,6 +26,7 @@
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.data import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
disable_adapter,
get_adapter_params,
@@ -412,7 +413,16 @@ def _setup_data(
iterable datasets and streaming datasets are not supported.
"""
world_size, rank = utils.get_world_size_and_rank()
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
)
16 changes: 11 additions & 5 deletions recipes/lora_dpo_single_device.py
@@ -11,13 +11,14 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.data import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
disable_adapter,
get_adapter_params,
@@ -297,10 +298,15 @@ def _setup_data(
Map-style Datasets which fit into memory and an option for random shuffling.
Samplers, iterable datasets, and streaming datasets are not supported.
"""
ds = config.instantiate(
cfg_dataset,
tokenizer=self._tokenizer,
)
if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds,
num_replicas=1,
14 changes: 12 additions & 2 deletions recipes/lora_finetune_distributed.py
@@ -13,7 +13,7 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.distributed import destroy_process_group, init_process_group
@@ -26,6 +26,7 @@
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_merged_lora_ckpt,
@@ -409,7 +410,16 @@ def _setup_data(
iterable datasets and streaming datasets are not supported.
"""
world_size, rank = utils.get_world_size_and_rank()
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
)
16 changes: 11 additions & 5 deletions recipes/lora_finetune_single_device.py
@@ -12,12 +12,13 @@
from warnings import warn

import torch
from omegaconf import DictConfig
from omegaconf import DictConfig, ListConfig

from torch import nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_merged_lora_ckpt,
@@ -337,10 +338,15 @@ def _setup_data(
Map-style Datasets which fit into memory and an option for random shuffling.
Samplers, iterable datasets, and streaming datasets are not supported.
"""
ds = config.instantiate(
cfg_dataset,
tokenizer=self._tokenizer,
)
if isinstance(cfg_dataset, ListConfig):
datasets = [
config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
for single_cfg_dataset in cfg_dataset
]
ds = ConcatDataset(datasets=datasets)
else:
ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)

sampler = DistributedSampler(
ds,
num_replicas=1,
73 changes: 40 additions & 33 deletions tests/torchtune/datasets/test_alpaca_dataset.py
@@ -7,6 +7,7 @@
from unittest.mock import patch

import pytest
from datasets import Dataset

from tests.test_utils import get_assets_path
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
@@ -28,17 +29,19 @@ def test_label_no_masking(self, load_dataset, tokenizer):
"""

# mock the call to HF datasets
load_dataset.return_value = [
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
load_dataset.return_value = Dataset.from_list(
[
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
)

alpaca_ds = alpaca_dataset(tokenizer=tokenizer)
input, labels = alpaca_ds[0]
@@ -55,17 +58,19 @@ def test_label_masking(self, load_dataset, tokenizer):
"""

# mock the call to HF datasets
load_dataset.return_value = [
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
load_dataset.return_value = Dataset.from_list(
[
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
)

alpaca_ds = alpaca_dataset(tokenizer=tokenizer, train_on_input=False)

@@ -90,17 +95,19 @@ def test_alpaca_clean(self, load_dataset, tokenizer):
"""

# mock the call to HF datasets
load_dataset.return_value = [
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
load_dataset.return_value = Dataset.from_list(
[
{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": (
"1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
"2. Exercise regularly to keep your body active and strong."
"3. Get enough sleep and maintain a consistent sleep schedule."
),
}
]
)

alpaca_ds = alpaca_cleaned_dataset(tokenizer=tokenizer)
input, labels = alpaca_ds[0]
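
For completeness, a rough usage sketch of the new concatenation path. This is not one of the tests added in this PR; it assumes ConcatDataset only relies on __len__ and __getitem__ of the wrapped map-style datasets, so plain lists of (tokens, labels) pairs stand in for real instruct datasets.

from torchtune.datasets import ConcatDataset

# Toy stand-ins for tokenized datasets (assumption: any sized, indexable
# collection works here).
ds_a = [([1, 2, 3], [1, 2, 3]), ([4, 5], [4, 5])]
ds_b = [([6, 7], [6, 7])]

combined = ConcatDataset(datasets=[ds_a, ds_b])

assert len(combined) == len(ds_a) + len(ds_b)  # lengths add up
assert combined[2] == ([6, 7], [6, 7])         # index 2 falls through to ds_b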