fix pretraining data sizes and weights (#9627)
Signed-off-by: Chen Cui <chcui@nvidia.com>
cuichenx authored Jul 6, 2024
1 parent 613e1f1 commit 7256db1
Showing 1 changed file with 4 additions and 1 deletion.
nemo/collections/llm/gpt/data/pre_training.py
@@ -42,6 +42,9 @@ def __init__(
             paths = [paths]
         if weights is not None:
             assert len(weights) == len(paths)
+            if len(weights) == 1:
+                # weights must be None if there is only one dataset
+                weights = None
 
         self.paths = paths
         self.weights = weights
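For context, here is a minimal runnable sketch of the normalization this hunk performs. It is illustrative only: the function name normalize_paths_and_weights and the isinstance guard around paths = [paths] are assumptions, not the actual PreTrainingDataModule code.

from typing import List, Optional, Tuple, Union

def normalize_paths_and_weights(
    paths: Union[str, List[str]],
    weights: Optional[List[float]] = None,
) -> Tuple[List[str], Optional[List[float]]]:
    # Wrap a bare path in a list (the exact guard is assumed; the diff
    # only shows the wrapped assignment as context).
    if not isinstance(paths, (list, tuple)):
        paths = [paths]
    if weights is not None:
        assert len(weights) == len(paths)
        if len(weights) == 1:
            # weights must be None if there is only one dataset
            weights = None
    return list(paths), weights

# A single dataset's weight carries no information, so it is dropped:
assert normalize_paths_and_weights("ds_a", [1.0]) == (["ds_a"], None)
# A genuine blend keeps its weights:
assert normalize_paths_and_weights(["ds_a", "ds_b"], [0.3, 0.7]) == (["ds_a", "ds_b"], [0.3, 0.7])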
@@ -90,7 +93,7 @@ def setup(self, stage: str = "") -> None:
 
         if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
             # This is to make sure we only have one epoch on every validation iteration
-            num_val_samples = None
+            num_val_samples = None if self.weights is None else 1
 
         train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples]
         self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
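The second hunk complements the first: per the in-line comment, a fractional limit_val_batches should yield exactly one validation epoch, and (assuming the Megatron dataset builder's convention that a None sample count means "consume the dataset once") that is only well-defined for a single, unblended dataset. When weights are set, a concrete count is required, and the patch uses 1. A hedged sketch of that branch follows, with a hypothetical helper name:

from typing import List, Optional, Union

def resolve_num_val_samples(
    limit_val_batches: Union[int, float],
    weights: Optional[List[float]],
    default_num_val_samples: int,
) -> Optional[int]:
    # Mirrors the fixed branch: None asks the builder for one full epoch,
    # which only makes sense for a single dataset (weights is None);
    # a blended dataset gets the concrete count used in the patch.
    if limit_val_batches <= 1.0 and isinstance(limit_val_batches, float):
        return None if weights is None else 1
    return default_num_val_samples

assert resolve_num_val_samples(1.0, None, 5000) is None        # single dataset: one epoch
assert resolve_num_val_samples(1.0, [0.5, 0.5], 5000) == 1     # blended: explicit count
assert resolve_num_val_samples(2, None, 5000) == 5000          # absolute batch count: unchanged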