
Raise DatasetTooSmall exception if canonical nodes is less than num samples #1518

Merged · 9 commits · Sep 12, 2024
4 changes: 3 additions & 1 deletion llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -478,7 +478,9 @@ def convert_text_to_mds(
     index_path = os.path.join(local_output_folder, 'index.json')
     with open(index_path, 'r') as index_file:
         if not json.load(index_file)['shards']:
-            raise DatasetTooSmallError()
+            raise DatasetTooSmallError(
+                reason='No shards were created when converting text to MDS.',
+            )

     # Write a done file with the args and object names
     write_done_file(local_output_folder, args_str, object_names)
20 changes: 19 additions & 1 deletion llmfoundry/data/finetuning/tasks.py
@@ -73,6 +73,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     ALLOWED_RESPONSE_KEYS,
     ChatTemplateError,
     ConsecutiveRepeatedChatRolesError,
+    DatasetTooSmallError,
     IncorrectMessageKeyQuantityError,
     InvalidContentTypeError,
     InvalidExampleTypeError,
@@ -1033,7 +1034,24 @@ def build_from_streaming(
         *args: Any,
         **kwargs: Any,
     ) -> StreamingFinetuningDataset:
-        return self.streaming_dataset_class(*args, **kwargs)
+        dataset = self.streaming_dataset_class(*args, **kwargs)
+        num_canonical_nodes = dataset.num_canonical_nodes
+        num_samples = dataset.num_samples
+        if num_canonical_nodes is None:
+            num_physical_nodes = dist.get_world_size(
+            ) // dist.get_local_world_size()
+            if num_samples < num_physical_nodes:
+                raise DatasetTooSmallError(
+                    f'{num_samples=} is less than {num_physical_nodes=}, the number of physical nodes.',
+                )
+
+        if num_canonical_nodes is not None and num_samples < num_canonical_nodes:
+            raise DatasetTooSmallError(
+                f'{num_samples=} is less than {num_canonical_nodes=}. ' +
+                'Please check your index.json file and ensure that your dataset has been written out correctly. ' +
+                'If this was intended, reduce num_canonical_nodes.',
+            )
+        return dataset


 dataset_constructor = DatasetConstructor()
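For context, a quick worked example of the node arithmetic in build_from_streaming, using the same illustrative values as the test added below (32 total ranks, 8 ranks per node):

    world_size = 32          # dist.get_world_size()
    local_world_size = 8     # dist.get_local_world_size()
    num_physical_nodes = world_size // local_world_size  # 4
    num_samples = 2
    # num_canonical_nodes unset: 2 < 4 physical nodes -> DatasetTooSmallError.
    # num_canonical_nodes == 8: 2 < 8 -> DatasetTooSmallError.
    # num_canonical_nodes == 2: 2 >= 2 -> dataset is accepted.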
6 changes: 3 additions & 3 deletions llmfoundry/utils/exceptions.py
@@ -376,9 +376,9 @@ def __init__(self, dataset_name: str, split: str) -> None:
 class DatasetTooSmallError(UserError):
     """Error thrown when the dataset is too small to be processed."""
 
-    def __init__(self) -> None:
-        message = f'Your dataset is too small and produced no complete samples during preprocessing. Please provide more data.'
-        super().__init__(message)
+    def __init__(self, reason: str) -> None:
+        message = f'Your dataset is too small and produced no complete samples or too few samples. Please provide more data. {reason}'
+        super().__init__(message, reason=reason)


class RunTimeoutError(InternalError):
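As a usage sketch, every call site now has to pass a reason; the call added in convert_text_to_mds.py above, for example, becomes:

    raise DatasetTooSmallError(
        reason='No shards were created when converting text to MDS.',
    )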
44 changes: 44 additions & 0 deletions tests/data/test_dataset.py
@@ -0,0 +1,44 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+from contextlib import nullcontext
+from typing import Optional
+from unittest import mock
+
+import pytest
+
+from llmfoundry.data.finetuning.tasks import dataset_constructor
+from llmfoundry.utils.exceptions import DatasetTooSmallError
+
+
+@pytest.mark.parametrize('num_canonical_nodes', [None, 8, 2])
+def test_finetuning_streaming_dataset_too_small(
+    num_canonical_nodes: Optional[int],
+):
+    num_samples = 2
+
+    class MockDataset:
+
+        def __init__(self):
+            self.num_canonical_nodes = num_canonical_nodes
+            self.num_samples = num_samples
+
+    class MockDist:
+
+        def get_world_size(self):
+            return 32
+
+        def get_local_world_size(self):
+            return 8
+
+    result_context = nullcontext(
+    ) if num_canonical_nodes == 2 else pytest.raises(DatasetTooSmallError)
+    with result_context:
+        with mock.patch(
+            'llmfoundry.data.finetuning.tasks.dist',
+            new=MockDist(),
+        ):
+            with mock.patch(
+                'llmfoundry.data.finetuning.tasks.DatasetConstructor.streaming_dataset_class',
+                new=MockDataset,
+            ):
+                dataset_constructor.build_from_streaming()
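The new test runs without GPUs or real data, since both dist and the streaming dataset class are mocked; a plain pytest invocation such as pytest tests/data/test_dataset.py is enough to exercise all three parametrized cases.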