
Feat: Add support of multiple datasets in config #889

Merged: 25 commits, merged May 3, 2024.

The diffs below are shown from the first 6 of the 25 commits, so some details (such as the MultiDataset name and the DictConfig check) reflect an intermediate state that later commits revise.
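For context before the commit log: the feature lets a torchtune config declare a list of datasets instead of a single one. A minimal sketch of the idea (the dataset component paths here are illustrative, not taken from this PR's configs):

from omegaconf import ListConfig, OmegaConf

# Illustrative config fragment: the `dataset` key holds a list of datasets.
cfg = OmegaConf.create(
    "dataset:\n"
    "  - _component_: torchtune.datasets.alpaca_dataset\n"
    "  - _component_: torchtune.datasets.grammar_dataset\n"
)

# The recipes branch on this type to choose between a single dataset and a
# concatenation of several (see the diffs below).
assert isinstance(cfg.dataset, ListConfig)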
Commits
c46dd6c
Support of list of datasets added to chat and instruct dataset constr…
EvilFreelancer Apr 27, 2024
83eff03
Tests added, docstrings updated
EvilFreelancer Apr 27, 2024
c7fcdab
All other tests fixed to a new format
EvilFreelancer Apr 27, 2024
da9df43
MultiDataset class added to utils
EvilFreelancer Apr 28, 2024
d6913c9
Multi-source logic removed from chat and instruct dataset builders
EvilFreelancer Apr 28, 2024
46ea2bc
MultiDataset logic added to some recipes
EvilFreelancer Apr 28, 2024
90a6294
isinstance(cfg_dataset, ListConfig) instead of check for DictConfig
EvilFreelancer Apr 28, 2024
abed501
MultiDataset refactoring
EvilFreelancer Apr 28, 2024
09078c4
All recipes switched to a new format of MultiConfig
EvilFreelancer Apr 28, 2024
071456d
Update torchtune/utils/multi_dataset.py
EvilFreelancer Apr 28, 2024
c4af9ea
len of MultiDataset will be calculated in constructor
EvilFreelancer Apr 28, 2024
c1307bf
Extra check removed from __getitem__ for MultiDataset
EvilFreelancer Apr 28, 2024
78d871d
Datasets cumulative indexes calculation moved to constructor
EvilFreelancer Apr 29, 2024
ccd99cf
Tests of MultiDataset utils class added
EvilFreelancer Apr 29, 2024
61c6d22
MultiDataset constructor type fixed
EvilFreelancer Apr 30, 2024
1ba70ac
noqa comment removed from MultiDataset
EvilFreelancer Apr 30, 2024
c0746d5
MultiDataset class moved to datasets namespace
EvilFreelancer Apr 30, 2024
913b352
Path to MultiDataset class fixed
EvilFreelancer Apr 30, 2024
c44badf
Merge branch 'pytorch:main' into feat-concatenate-datasets
EvilFreelancer Apr 30, 2024
26b2997
New dataset format added to tutorials
EvilFreelancer Apr 30, 2024
4faa4d4
MultiDataset renamed to ConcatDataset
EvilFreelancer May 3, 2024
0d11d88
A comprehensive docstring about ConcatDataset added
EvilFreelancer May 3, 2024
90731a3
Update docs/source/tutorials/datasets.rst
EvilFreelancer May 3, 2024
854eb05
ConcatDataset added to api_ref_dataset.rst
EvilFreelancer May 3, 2024
f7a3f95
Extra EOL added to datasets.rst
EvilFreelancer May 3, 2024
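Taken together, the commits above describe the shape of the final class. Below is a rough sketch of what ConcatDataset likely looks like after the renames and refactors (an approximation for orientation, not the merged code): cumulative index ranges and the total length are computed once in the constructor, and __getitem__ does a plain range lookup with no extra bounds check.

from typing import List, Tuple

from torch.utils.data import Dataset


class ConcatDataset(Dataset):
    """Presents several map-style datasets as one contiguous dataset."""

    def __init__(self, datasets: List[Dataset]):
        self._datasets = datasets
        # Precompute (start, stop, dataset_index) windows so that __getitem__
        # only needs a scan, and cache the total length.
        self._indexes: List[Tuple[int, int, int]] = []
        cumulative = 0
        for idx, dataset in enumerate(datasets):
            self._indexes.append((cumulative, cumulative + len(dataset), idx))
            cumulative += len(dataset)
        self._len = cumulative

    def __getitem__(self, index: int):
        for start, stop, dataset_idx in self._indexes:
            if start <= index < stop:
                return self._datasets[dataset_idx][index - start]

    def __len__(self) -> int:
        return self._len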
10 changes: 6 additions & 4 deletions recipes/full_finetune_distributed.py
@@ -357,10 +357,12 @@ def _setup_data(
         iterable datasets and streaming datasets are not supported.
         """
         world_size, rank = utils.get_world_size_and_rank()
-        ds = config.instantiate(
-            cfg_dataset,
-            tokenizer=self._tokenizer,
-        )
+
+        if isinstance(cfg_dataset.get(0), DictConfig):
Contributor commented:

You can also just directly check if it's a ListConfig. If it's a single dataset then this might fail.

Suggested change:

-        if isinstance(cfg_dataset.get(0), DictConfig):
+        if isinstance(cfg_dataset, ListConfig):

I also wonder if there's a better way to handle this so we don't have to repeat this if-else check across all recipes...

Contributor Author (EvilFreelancer) replied:

Good point, I've replaced it with a ListConfig check.

Contributor Author (EvilFreelancer, Apr 28, 2024) replied:

> I also wonder if there's a better way to handle this so we don't have to repeat this if-else check across all recipes...

Yeah, this logic could be moved (for example) into the config.instantiate method, but I guess that would break the single responsibility principle. So I suggest leaving it as is.
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds,
             num_replicas=world_size,
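On the reviewer's question about not repeating this if-else across recipes, one possible factoring is a small shared helper. This is a sketch only: instantiate_dataset is a hypothetical name, not something this PR adds.

from omegaconf import ListConfig

from torchtune import config, utils


def instantiate_dataset(cfg_dataset, tokenizer):
    # Hypothetical helper centralizing the branch each recipe repeats:
    # a ListConfig means the config declared several datasets to concatenate.
    if isinstance(cfg_dataset, ListConfig):
        return utils.MultiDataset(datasets=cfg_dataset, tokenizer=tokenizer)
    return config.instantiate(cfg_dataset, tokenizer=tokenizer)

Each recipe's _setup_data would then collapse to a single call like ds = instantiate_dataset(cfg_dataset, self._tokenizer).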
9 changes: 5 additions & 4 deletions recipes/full_finetune_single_device.py
@@ -320,10 +320,11 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        ds = config.instantiate(
-            cfg_dataset,
-            tokenizer=self._tokenizer,
-        )
+        if isinstance(cfg_dataset.get(0), DictConfig):
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds,
             num_replicas=1,
7 changes: 6 additions & 1 deletion recipes/lora_dpo_distributed.py
@@ -412,7 +412,12 @@ def _setup_data(
         iterable datasets and streaming datasets are not supported.
         """
         world_size, rank = utils.get_world_size_and_rank()
-        ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
+        if isinstance(cfg_dataset.get(0), DictConfig):
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
         )
9 changes: 5 additions & 4 deletions recipes/lora_dpo_single_device.py
@@ -297,10 +297,11 @@ def _setup_data(
         Map-style Datasets which fit into memory and an option for random shuffling.
         Samplers, iterable datasets, and streaming datasets are not supported.
         """
-        ds = config.instantiate(
-            cfg_dataset,
-            tokenizer=self._tokenizer,
-        )
+        if isinstance(cfg_dataset.get(0), DictConfig):
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds,
             num_replicas=1,
7 changes: 6 additions & 1 deletion recipes/lora_finetune_distributed.py
@@ -409,7 +409,12 @@ def _setup_data(
         iterable datasets and streaming datasets are not supported.
         """
         world_size, rank = utils.get_world_size_and_rank()
-        ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
+        if isinstance(cfg_dataset.get(0), DictConfig):
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
         )
9 changes: 5 additions & 4 deletions recipes/lora_finetune_single_device.py
@@ -337,10 +337,11 @@ def _setup_data(
         Map-style Datasets which fit into memory and an option for random shuffling.
         Samplers, iterable datasets, and streaming datasets are not supported.
         """
-        ds = config.instantiate(
-            cfg_dataset,
-            tokenizer=self._tokenizer,
-        )
+        if isinstance(cfg_dataset.get(0), DictConfig):
+            ds = utils.MultiDataset(datasets=cfg_dataset, tokenizer=self._tokenizer)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
         sampler = DistributedSampler(
             ds,
             num_replicas=1,
73 changes: 40 additions & 33 deletions tests/torchtune/datasets/test_alpaca_dataset.py
Contributor commented:

Do we still need to make these changes to the test samples to use Dataset across all the individual dataset test files?

Contributor Author replied:

I think we need to keep the datasets.Dataset objects, since this is a common format and anyone can see that the data should come in this format; plus there is no need to perform any extra transformations, as was the case, for example, in the _chat tests.
@@ -7,6 +7,7 @@
 from unittest.mock import patch
 
 import pytest
+from datasets import Dataset
 
 from tests.test_utils import get_assets_path
 from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX

@@ -28,17 +29,19 @@ def test_label_no_masking(self, load_dataset, tokenizer):
         """
 
         # mock the call to HF datasets
-        load_dataset.return_value = [
-            {
-                "instruction": "Give three tips for staying healthy.",
-                "input": "",
-                "output": (
-                    "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
-                    "2. Exercise regularly to keep your body active and strong."
-                    "3. Get enough sleep and maintain a consistent sleep schedule."
-                ),
-            }
-        ]
+        load_dataset.return_value = Dataset.from_list(
Contributor commented:

What's the story with these Dataset.from_list changes? I know it will give us the right return type (Dataset instead of a raw list); is there anything besides that motivating the change? (I am fine with keeping them in, mainly asking out of curiosity.)

Contributor commented:

Technically it's a more accurate return type for load_dataset. I think it's OK to just leave these out and stick to primitives for simplicity, but I don't have a strong opinion here.

Contributor Author replied:

I've invested considerable time into understanding how to test my new dataset class. It was initially unclear what format and content its elements required. Hence, I suggest providing clear guidelines elucidating the expected format for dataset elements, to save fellow programmers time.

Contributor commented:

Thanks, this is very good feedback. I agree that the contracts of the various dataset components are not always obvious and take some time to sort through. Aside from improving live docs and better code comments, I'm open to any suggestions you have on how to make this clearer based on your experience.
+            [
+                {
+                    "instruction": "Give three tips for staying healthy.",
+                    "input": "",
+                    "output": (
+                        "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
+                        "2. Exercise regularly to keep your body active and strong."
+                        "3. Get enough sleep and maintain a consistent sleep schedule."
+                    ),
+                }
+            ]
+        )
 
         alpaca_ds = alpaca_dataset(tokenizer=tokenizer)
         input, labels = alpaca_ds[0]

@@ -55,17 +58,19 @@ def test_label_masking(self, load_dataset, tokenizer):
         """
 
         # mock the call to HF datasets
-        load_dataset.return_value = [
-            {
-                "instruction": "Give three tips for staying healthy.",
-                "input": "",
-                "output": (
-                    "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
-                    "2. Exercise regularly to keep your body active and strong."
-                    "3. Get enough sleep and maintain a consistent sleep schedule."
-                ),
-            }
-        ]
+        load_dataset.return_value = Dataset.from_list(
+            [
+                {
+                    "instruction": "Give three tips for staying healthy.",
+                    "input": "",
+                    "output": (
+                        "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
+                        "2. Exercise regularly to keep your body active and strong."
+                        "3. Get enough sleep and maintain a consistent sleep schedule."
+                    ),
+                }
+            ]
+        )
 
         alpaca_ds = alpaca_dataset(tokenizer=tokenizer, train_on_input=False)
 
@@ -90,17 +95,19 @@ def test_alpaca_clean(self, load_dataset, tokenizer):
         """
 
         # mock the call to HF datasets
-        load_dataset.return_value = [
-            {
-                "instruction": "Give three tips for staying healthy.",
-                "input": "",
-                "output": (
-                    "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
-                    "2. Exercise regularly to keep your body active and strong."
-                    "3. Get enough sleep and maintain a consistent sleep schedule."
-                ),
-            }
-        ]
+        load_dataset.return_value = Dataset.from_list(
+            [
+                {
+                    "instruction": "Give three tips for staying healthy.",
+                    "input": "",
+                    "output": (
+                        "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables."
+                        "2. Exercise regularly to keep your body active and strong."
+                        "3. Get enough sleep and maintain a consistent sleep schedule."
+                    ),
+                }
+            ]
+        )
 
         alpaca_ds = alpaca_cleaned_dataset(tokenizer=tokenizer)
         input, labels = alpaca_ds[0]
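To make the motivation behind the Dataset.from_list discussion above concrete, here is a small standalone example (independent of torchtune) showing that the mock now matches the real return type of datasets.load_dataset:

from datasets import Dataset

# A plain list also supports indexing, but Dataset.from_list reproduces the
# actual return type of datasets.load_dataset, so column access and iteration
# behave in tests exactly as they do in production code.
samples = [{"instruction": "Say hi.", "input": "", "output": "Hi!"}]
ds = Dataset.from_list(samples)

assert isinstance(ds, Dataset)
assert ds[0]["output"] == "Hi!"
assert ds.column_names == ["instruction", "input", "output"]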
50 changes: 24 additions & 26 deletions tests/torchtune/datasets/test_chat_dataset.py
@@ -7,10 +7,10 @@
 from unittest import mock
 
 import pytest
+from datasets import Dataset
 from tests.test_utils import DummyTokenizer
 from torchtune.data import Message
 from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
-
 from torchtune.datasets import ChatDataset

@@ -31,9 +31,11 @@ def format(
         formats = {"system": cls.system, "user": cls.user, "assistant": cls.assistant}
         formatted_dialogue = []
         for message in messages:
-            content = formats.get(message.role).format(content=message.content)
+            content = formats.get(message["role"]).format(content=message["content"])
             formatted_dialogue.append(
-                Message(role=message.role, content=content, masked=message.masked),
+                Message(
+                    role=message["role"], content=content, masked=message["masked"]
+                ),
             )
         return formatted_dialogue
@@ -57,26 +59,30 @@ def dialogue(self):
         return [
             {
                 "dialogue": [
-                    Message(
-                        role="system", content="You are an AI assistant.", masked=True
-                    ),
-                    Message(
-                        role="user", content="What is the meaning of life?", masked=True
-                    ),
-                    Message(
-                        role="assistant",
-                        content="The meaning of life is 42.",
-                        masked=False,
-                    ),
-                    Message(role="user", content="That's ridiculous.", masked=True),
-                    Message(role="assistant", content="I agree.", masked=False),
+                    {
+                        "role": "system",
+                        "content": "You are an AI assistant.",
+                        "masked": True,
+                    },
+                    {
+                        "role": "user",
+                        "content": "What is the meaning of life?",
+                        "masked": True,
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "The meaning of life is 42.",
+                        "masked": False,
+                    },
+                    {"role": "user", "content": "That's ridiculous.", "masked": True},
+                    {"role": "assistant", "content": "I agree.", "masked": False},
                 ],
             },
         ]
 
     @mock.patch("torchtune.datasets._chat.load_dataset")
     def test_get_item(self, mock_load_dataset, chat_format, dialogue):
-        mock_load_dataset.return_value = dialogue
+        mock_load_dataset.return_value = Dataset.from_list(dialogue)
         expected_tokenized_prompts = [
             [
                 0,

@@ -114,15 +120,7 @@ def test_get_item(self, mock_load_dataset, chat_format, dialogue):
         prompt_lengths = (15, 5)
         expected_labels = [
             [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0]
-            + [
-                3,
-                7,
-                2,
-                4,
-                2,
-                3,
-                -1,
-            ]
+            + [3, 7, 2, 4, 2, 3, -1]
             + [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1]
             + [1, 6, -1]
         ]
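A likely reason the chat fixtures switched from Message objects to plain dicts (an inference; the thread does not state it explicitly): Dataset.from_list serializes samples to Arrow, which accepts plain Python types but not arbitrary objects. A sketch of the round trip:

from datasets import Dataset

from torchtune.data import Message

# Arrow-backed Datasets store plain types, so the fixture keeps each message
# as a dict...
raw = {"role": "user", "content": "Hello", "masked": True}
ds = Dataset.from_list([{"dialogue": [raw]}])

# ...and the chat format under test rebuilds Message objects from those dicts.
sample = ds[0]["dialogue"][0]
message = Message(
    role=sample["role"], content=sample["content"], masked=sample["masked"]
)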
29 changes: 17 additions & 12 deletions tests/torchtune/datasets/test_grammar_dataset.py
@@ -7,6 +7,7 @@
 from unittest.mock import patch
 
 import pytest
+from datasets import Dataset
 
 from tests.test_utils import get_assets_path
 from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX

@@ -29,12 +30,14 @@ def test_label_no_masking(self, load_dataset, tokenizer):
         """
 
         # mock the call to HF datasets
-        load_dataset.return_value = [
-            {
-                "input": "Bitcoin is for $7,094 this morning, which CoinDesk says.",
-                "output": "Bitcoin goes for $7,094 this morning, according to CoinDesk.",
-            }
-        ]
+        load_dataset.return_value = Dataset.from_list(
+            [
+                {
+                    "input": "Bitcoin is for $7,094 this morning, which CoinDesk says.",
+                    "output": "Bitcoin goes for $7,094 this morning, according to CoinDesk.",
+                }
+            ]
+        )
 
         grammar_ds = grammar_dataset(tokenizer=tokenizer, train_on_input=True)
         input, labels = grammar_ds[0]
@@ -51,12 +54,14 @@ def test_label_masking(self, load_dataset, tokenizer):
         """
 
         # mock the call to HF datasets
-        load_dataset.return_value = [
-            {
-                "input": "Bitcoin is for $7,094 this morning, which CoinDesk says.",
-                "output": "Bitcoin goes for $7,094 this morning, according to CoinDesk.",
-            }
-        ]
+        load_dataset.return_value = Dataset.from_list(
+            [
+                {
+                    "input": "Bitcoin is for $7,094 this morning, which CoinDesk says.",
+                    "output": "Bitcoin goes for $7,094 this morning, according to CoinDesk.",
+                }
+            ]
+        )
 
         grammar_ds = grammar_dataset(tokenizer=tokenizer)
 
35 changes: 31 additions & 4 deletions tests/torchtune/datasets/test_instruct_dataset.py
@@ -6,10 +6,9 @@
 
 from unittest import mock
 
+from datasets import Dataset
 from tests.test_utils import DummyTokenizer
-
 from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
-
 from torchtune.datasets import InstructDataset
@@ -60,6 +59,34 @@ class TestInstructDataset:
             -1,
         ],
         [0, 12, 4, 2, 2, 12, 10, 6, 4, 2, 2, 6, 10, 9, 1, 6, 4, 4, 3, 6, 2, 4, -1],
+        [
+            0,
+            12,
+            4,
+            2,
+            3,
+            2,
+            12,
+            10,
+            6,
+            4,
+            2,
+            3,
+            2,
+            6,
+            10,
+            9,
+            1,
+            5,
+            4,
+            4,
+            3,
+            6,
+            2,
+            4,
+            -1,
+        ],
+        [0, 12, 4, 2, 2, 12, 10, 6, 4, 2, 2, 6, 10, 9, 1, 6, 4, 4, 3, 6, 2, 4, -1],
     ]
 
     def get_samples(self):
@@ -78,7 +105,7 @@ def get_samples(self):
 
     @mock.patch("torchtune.datasets._instruct.load_dataset")
     def test_get_item_no_train_on_input(self, mock_load_dataset):
-        mock_load_dataset.return_value = self.get_samples()
+        mock_load_dataset.return_value = Dataset.from_list(self.get_samples())
         prompt_lengths = (16, 14)
         expected_labels = [
             [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0]

@@ -104,7 +131,7 @@ def test_get_item_no_train_on_input(self, mock_load_dataset):
 
     @mock.patch("torchtune.datasets._instruct.load_dataset")
     def test_get_item_train_on_input(self, mock_load_dataset):
-        mock_load_dataset.return_value = self.get_samples()
+        mock_load_dataset.return_value = Dataset.from_list(self.get_samples())
         expected_labels = self.expected_tokenized_prompts
 
         dataset = InstructDataset(