From fab1bd97fee31633963d40577d4dc5846379e537 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Sun, 6 Oct 2024 23:52:58 -0700 Subject: [PATCH 1/7] move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis --- tests/lightning/{io => _io}/__init__.py | 0 tests/lightning/{io => _io}/test_api.py | 0 tests/lightning/{io => _io}/test_mixin.py | 0 tests/lightning/{io => _io}/test_state.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/lightning/{io => _io}/__init__.py (100%) rename tests/lightning/{io => _io}/test_api.py (100%) rename tests/lightning/{io => _io}/test_mixin.py (100%) rename tests/lightning/{io => _io}/test_state.py (100%) diff --git a/tests/lightning/io/__init__.py b/tests/lightning/_io/__init__.py similarity index 100% rename from tests/lightning/io/__init__.py rename to tests/lightning/_io/__init__.py diff --git a/tests/lightning/io/test_api.py b/tests/lightning/_io/test_api.py similarity index 100% rename from tests/lightning/io/test_api.py rename to tests/lightning/_io/test_api.py diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/_io/test_mixin.py similarity index 100% rename from tests/lightning/io/test_mixin.py rename to tests/lightning/_io/test_mixin.py diff --git a/tests/lightning/io/test_state.py b/tests/lightning/_io/test_state.py similarity index 100% rename from tests/lightning/io/test_state.py rename to tests/lightning/_io/test_state.py From d1fb9c480f3a7613c38b8bcbdd87c6fb877681bb Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Sun, 6 Oct 2024 23:53:24 -0700 Subject: [PATCH 2/7] add microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis --- tests/lightning/mcore_microbatch_utils.py | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tests/lightning/mcore_microbatch_utils.py diff --git a/tests/lightning/mcore_microbatch_utils.py b/tests/lightning/mcore_microbatch_utils.py new file mode 100644 index 000000000000..3f9080826b68 
--- /dev/null +++ b/tests/lightning/mcore_microbatch_utils.py @@ -0,0 +1,25 @@ +import contextlib + +# @akoumparouli: use a context manager that saves/restores gbs/mbs when using +# reconfigure_num_microbatches_calculator to avoid interference between tests. +@contextlib.contextmanager +def reconfigure_num_microbatches_calculator_manager(*args, **kwargs): + import megatron.core.num_microbatches_calculator as mb_calc + # Store current mbs, gbs values + if not mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + _mbs = mb_calc.get_micro_batch_size() + _gbs = mb_calc.get_current_global_batch_size() + + # use user's settings + mb_calc.reconfigure_num_microbatches_calculator(*args, **kwargs) + else: + _mbs, _gbs = 1, 1 + + try: + # run user's code + yield + # @akoumparouli: no catch + finally: + # restore old mbs, gbs + if not mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + mb_calc.reconfigure_num_microbatches_calculator(0, None, _gbs, _mbs, data_parallel_size=1) From f305f80bea51387805804ebfd20c50302a0737be Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Sun, 6 Oct 2024 23:54:23 -0700 Subject: [PATCH 3/7] use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis --- tests/lightning/test_dist_ckpt.py | 148 +++++++++--------- tests/lightning/test_nemo_resume_from_ckpt.py | 53 ++++--- 2 files changed, 105 insertions(+), 96 deletions(-) diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index e6ea381fdf0b..4884dd5b33a6 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -24,7 +24,6 @@ def set_env(): import pytest import pytorch_lightning as pl import torch -from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator import nemo.lightning as nl from nemo.collections import llm @@ -43,12 +42,10 @@ def _get_last_checkpoint_dir(model: pl.LightningModule, suffix: str = '') -> Pat return f'epoch={model.trainer.current_epoch - 
1}-step={model.trainer.max_steps - 1}{suffix}' -def get_model_and_data(): - micro_batch_size = 2 - global_batch_size = 2 +def get_model_and_data(mbs=2, gbs=2): seq_length = 128 data = llm.MockDataModule( - seq_length=seq_length, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size + seq_length=seq_length, micro_batch_size=mbs, global_batch_size=gbs ) config = llm.GPTConfig( @@ -59,13 +56,6 @@ def get_model_and_data(): seq_length=seq_length, apply_query_key_layer_scaling=1, ) - reconfigure_num_microbatches_calculator( - 0, - None, - global_batch_size, - micro_batch_size, - data_parallel_size=1, - ) return llm.GPTModel(config, tokenizer=data.tokenizer), data @@ -76,21 +66,28 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): set_env() assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1' - model, data = get_model_and_data() - - strategy = _get_strategy() - - trainer = nl.Trainer( - devices=1, - accelerator="gpu", - strategy=strategy, - enable_checkpointing=True, - max_steps=2, - default_root_dir=str(tmp_path), - logger=False, - ) - - trainer.fit(model, data) + gbs, mbs = 2, 2 + model, data = get_model_and_data(mbs, gbs) + from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + with reconfigure_num_microbatches_calculator_manager(0, + None, + gbs, + mbs, + data_parallel_size=1): + + strategy = _get_strategy() + + trainer = nl.Trainer( + devices=1, + accelerator="gpu", + strategy=strategy, + enable_checkpointing=True, + max_steps=2, + default_root_dir=str(tmp_path), + logger=False, + ) + + trainer.fit(model, data) assert isinstance(trainer.strategy.checkpoint_io, MegatronCheckpointIO) # Ckpt path doesn't contain the .ckpt suffix @@ -104,51 +101,58 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path): set_env() assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1' - model, data = get_model_and_data() - - 
sync_ckpt_dir = tmp_path / 'sync_checkpoints' - async_ckpt_dir = tmp_path / 'async_checkpoints' - - sync_checkpoint_io = MegatronCheckpointIO('torch_dist') - async_checkpoint_io = AsyncFinalizableCheckpointIO(MegatronCheckpointIO('torch_dist', async_save=True)) - - # dummy_trainer just to initialize NCCL - dummy_trainer = pl.Trainer( - devices=1, - logger=False, - max_steps=2, - strategy=_get_strategy(), - ) - dummy_trainer.fit(model, data) - strategy = _get_strategy() - tmp_path = strategy.broadcast(tmp_path) - - ## reset the model and data and train with sync checkpointing - model, data = get_model_and_data() - sync_test_trainer = pl.Trainer( - devices=1, - enable_checkpointing=True, - logger=False, - max_steps=2, - strategy=_get_strategy(), - plugins=[sync_checkpoint_io], - default_root_dir=str(sync_ckpt_dir), - ) - sync_test_trainer.fit(model, data) - - ## reset the model and data and train with sync checkpointing - model, data = get_model_and_data() - async_test_trainer = pl.Trainer( - devices=1, - enable_checkpointing=True, - logger=False, - max_steps=2, - strategy=_get_strategy(), - plugins=[async_checkpoint_io], - callbacks=AsyncFinalizerCallback(), - default_root_dir=str(async_ckpt_dir), - ) - async_test_trainer.fit(model, data) + gbs, mbs = 2, 2 + model, data = get_model_and_data(mbs, gbs) + from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + with reconfigure_num_microbatches_calculator_manager(0, + None, + gbs, + mbs, + data_parallel_size=1): + + sync_ckpt_dir = tmp_path / 'sync_checkpoints' + async_ckpt_dir = tmp_path / 'async_checkpoints' + + sync_checkpoint_io = MegatronCheckpointIO('torch_dist') + async_checkpoint_io = AsyncFinalizableCheckpointIO(MegatronCheckpointIO('torch_dist', async_save=True)) + + # dummy_trainer just to initialize NCCL + dummy_trainer = pl.Trainer( + devices=1, + logger=False, + max_steps=2, + strategy=_get_strategy(), + ) + dummy_trainer.fit(model, data) + strategy = 
_get_strategy() + tmp_path = strategy.broadcast(tmp_path) + + ## reset the model and data and train with sync checkpointing + model, data = get_model_and_data(mbs, gbs) + sync_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[sync_checkpoint_io], + default_root_dir=str(sync_ckpt_dir), + ) + sync_test_trainer.fit(model, data) + + ## reset the model and data and train with async checkpointing + model, data = get_model_and_data(mbs, gbs) + async_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[async_checkpoint_io], + callbacks=AsyncFinalizerCallback(), + default_root_dir=str(async_ckpt_dir), + ) + async_test_trainer.fit(model, data) checkpoint = {'sharded_state_dict': model.sharded_state_dict()} diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index 31ab88546cb3..8ec47900ba74 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -27,7 +27,6 @@ def set_env(): import pytest import torch -from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator from megatron.core.optimizer import OptimizerConfig import nemo.lightning as nl @@ -90,7 +89,7 @@ def compare_ckpts(a, b, path=[]): raise ValueError("Unexpected value type " + str(type(a))) -def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): +def setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1): seq_length = 2048 tokenizer = get_nmt_tokenizer( "megatron", @@ -108,14 +107,10 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): tokenizer=tokenizer, split='9999,1,1', ) - # Other tests might have different configs, so need to configure explicitly. 
- reconfigure_num_microbatches_calculator( - 0, - None, - gbs, - mbs, - data_parallel_size=1, - ) + return data + +def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): + seq_length = 2048 gpt_config = llm.GPTConfig( num_layers=2, hidden_size=128, @@ -131,7 +126,7 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): masked_softmax_fusion=False, ) - model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) + model = llm.GPTModel(gpt_config, tokenizer=tokenizer) opt_config = OptimizerConfig( optimizer='adam', @@ -148,7 +143,7 @@ def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1): ) optim = MegatronOptimizerModule(config=opt_config) - return gpt_config, data, model, optim + return gpt_config, model, optim def setup_trainer_and_logger(log_dir): @@ -248,18 +243,28 @@ def train(n_steps, resume): log_dir = f'/tmp/mcore_logs_{n_steps}steps' os.makedirs(log_dir, exist_ok=True) data_path = [DATA_PATH] - gpt_config, data, model, optim = setup_data_model_optim(log_dir, n_steps, data_path) - trainer, nemo_logger = setup_trainer_and_logger(log_dir) - llm.train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - resume=resume, - tokenizer='data', - optim=optim, - ) - trainer._teardown() + data = setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1) + # Other tests might have different configs, so need to configure explicitly. 
+ from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + with reconfigure_num_microbatches_calculator_manager( + 0, + None, + 2, # gbs + 1, # mbs + data_parallel_size=1, + ): + gpt_config, model, optim = setup_model_optim(log_dir, n_steps, data.tokenizer) + trainer, nemo_logger = setup_trainer_and_logger(log_dir) + llm.train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + resume=resume, + tokenizer='data', + optim=optim, + ) + trainer._teardown() set_env() assert os.environ['NVTE_FLASH_ATTN'] == '0' From 41685eccedc638e4ec7beadb1d906bb6cf7416b1 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 7 Oct 2024 00:08:51 -0700 Subject: [PATCH 4/7] add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis --- tests/lightning/test_state_restoration.py | 190 ++++++++++++++-------- 1 file changed, 121 insertions(+), 69 deletions(-) diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index 2f4c60395725..327ccdf5f6b1 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -11,9 +11,10 @@ from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning import NeMoLogger +from nemo.lightning import NeMoLogger, AutoResume from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager VOCAB_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json" MERGES_PATH = "/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt" @@ -21,6 +22,11 
@@ EXP_DIR = '/tmp/nemo_exp/' +def teardown(exp_dir=EXP_DIR): + import shutil + shutil.rmtree(exp_dir) + + class ValidateOptStateRestoration(Callback): def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # PTL has no on_load_checkpoint_start event to be triggered before @@ -59,7 +65,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): - assert torch.all(p == 0), "Expected params to be zero" + assert torch.all(p == 0), "Expected params (scratch) to be zero" with torch.no_grad(): for p in pl_module.parameters(): p.fill_(random.uniform(0, 1)) @@ -69,14 +75,18 @@ class ValidateModelRestoration(Callback): def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): p.detach().zero_() + self.called_on_load_checkpoint = False + + def on_load_checkpoint(self, trainer, pl_module, checkpoint) -> None: + self.called_on_load_checkpoint = True def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: for p in pl_module.parameters(): - assert not torch.all(p == 0), "Expected params to be non-zero" - + assert not torch.all(p == 0), "Expected params (resume) to be non-zero" + assert hasattr(self, 'called_on_load_checkpoint') + assert self.called_on_load_checkpoint == True, "Expected to have called on_load_checkpoint" -def make_model_optim_data(): - seq_length = 2048 +def setup_data(mbs=1, gbs=2, seq_length=2048): tokenizer = get_nmt_tokenizer( "megatron", "GPT2BPETokenizer", @@ -87,16 +97,19 @@ def make_model_optim_data(): data = PreTrainingDataModule( paths=DATA_PATH, seq_length=2048, - micro_batch_size=1, - global_batch_size=2, + micro_batch_size=mbs, + global_batch_size=gbs, seed=1234, tokenizer=tokenizer, ) + return data + +def make_model_optim(tokenizer, mbs=1, gbs=2, seq_length=2048): gpt_config 
= llm.GPTConfig( - num_layers=12, - hidden_size=768, - ffn_hidden_size=3072, + num_layers=2, + hidden_size=128, + ffn_hidden_size=256, num_attention_heads=12, seq_length=seq_length, init_method_std=0.023, @@ -106,7 +119,7 @@ def make_model_optim_data(): make_vocab_size_divisible_by=128, masked_softmax_fusion=False, ) - model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) + model = llm.GPTModel(gpt_config, tokenizer=tokenizer) opt = MegatronOptimizerModule( config=OptimizerConfig( @@ -125,64 +138,103 @@ def make_model_optim_data(): ), ) - return model, opt, data - - -def run_train_from_scratch(): - model, opt, data = make_model_optim_data() - trainer = nl.Trainer( - devices=2, - max_steps=10, - accelerator="gpu", - strategy=nl.MegatronStrategy(), - callbacks=[ValidateOptStateScratchInit(), ValidateModelScratchInit()], - log_every_n_steps=1, - limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - train( - model=model, - data=data, - trainer=trainer, - log=NeMoLogger( - log_dir=EXP_DIR, - ), - tokenizer='data', - optim=opt, - ) - - -def run_resume_train(): - model, opt, data = make_model_optim_data() - trainer = nl.Trainer( - devices=2, - max_steps=1, - accelerator="gpu", - strategy=nl.MegatronStrategy(), - callbacks=[ValidateOptStateRestoration(), ValidateModelRestoration()], - log_every_n_steps=1, - limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - ) - - train( - model=model, - data=data, - trainer=trainer, - log=NeMoLogger( - log_dir=EXP_DIR, - ), - tokenizer='data', - optim=opt, - resume=nl.AutoResume( - resume_if_exists=True, - resume_ignore_no_checkpoint=True, - ), - ) + return model, opt + + +def run_train_from_scratch(mbs, gbs, num_dev): + data = setup_data(mbs, gbs) + model, opt = make_model_optim(data.tokenizer, mbs, gbs) + # Other tests might have different configs, so need to configure explicitly. 
+ with reconfigure_num_microbatches_calculator_manager( + 0, + None, + gbs, + mbs, + data_parallel_size=num_dev, + ): + trainer = nl.Trainer( + devices=num_dev, + max_steps=10, + accelerator="gpu", + strategy=nl.MegatronStrategy(), + callbacks=[ValidateOptStateScratchInit(), ValidateModelScratchInit()], + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + train( + model=model, + data=data, + trainer=trainer, + log=NeMoLogger( + log_dir=EXP_DIR, + version='v1', + use_datetime_version=True, + update_logger_directory=True, + wandb=None, + ), + resume=AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ), + tokenizer='data', + optim=opt, + ) + trainer._teardown() + + +def run_resume_train(mbs, gbs, num_dev): + data = setup_data(mbs, gbs) + model, opt = make_model_optim(data.tokenizer, mbs, gbs) + # Other tests might have different configs, so need to configure explicitly. + with reconfigure_num_microbatches_calculator_manager( + 0, + None, + gbs, + mbs, + data_parallel_size=num_dev, + ): + trainer = nl.Trainer( + devices=num_dev, + max_steps=1, + accelerator="gpu", + strategy=nl.MegatronStrategy(), + callbacks=[ValidateOptStateRestoration(), ValidateModelRestoration()], + log_every_n_steps=1, + limit_val_batches=2, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + from nemo.lightning.pytorch.strategies.utils import RestoreConfig + + train( + model=model, + data=data, + trainer=trainer, + tokenizer='data', + optim=opt, + log=NeMoLogger( + log_dir=EXP_DIR, + version='v1', + use_datetime_version=True, + update_logger_directory=True, + wandb=None, + ), + resume=AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + resume_from_path=f'{EXP_DIR}default/v1/checkpoints/default--None=0.0000-epoch=0/', + ), + ) + trainer._teardown() @pytest.mark.run_only_on('GPU') def test_optim_state_restoration(): - run_train_from_scratch() - run_resume_train() + mbs, gbs = 
1, 2 + num_devices = 1 + try: + run_train_from_scratch(mbs, gbs, num_devices) + run_resume_train(mbs, gbs, num_devices) + finally: + teardown() From 28812a9b57b52a06882dea86aecf606600b8e05f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 7 Oct 2024 00:09:55 -0700 Subject: [PATCH 5/7] remove unused var Signed-off-by: Alexandros Koumparoulis --- tests/lightning/test_dist_ckpt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index 4884dd5b33a6..b0217729113c 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -125,7 +125,6 @@ def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path): ) dummy_trainer.fit(model, data) strategy = _get_strategy() - tmp_path = strategy.broadcast(tmp_path) ## reset the model and data and train with sync checkpointing model, data = get_model_and_data(mbs, gbs) From 422df96f07b22a68a4577128c4b3b3233334143d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 7 Oct 2024 00:12:21 -0700 Subject: [PATCH 6/7] fix Signed-off-by: Alexandros Koumparoulis --- tests/lightning/test_state_restoration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index 327ccdf5f6b1..e341a3493c7e 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -222,7 +222,7 @@ def run_resume_train(mbs, gbs, num_dev): ), resume=AutoResume( resume_if_exists=True, - resume_ignore_no_checkpoint=True, + resume_ignore_no_checkpoint=False, resume_from_path=f'{EXP_DIR}default/v1/checkpoints/default--None=0.0000-epoch=0/', ), ) From 02b776250d64741a65e4bc6376ab5d75207f3456 Mon Sep 17 00:00:00 2001 From: akoumpa Date: Mon, 7 Oct 2024 07:13:15 +0000 Subject: [PATCH 7/7] Apply isort and black reformatting Signed-off-by: akoumpa --- tests/lightning/mcore_microbatch_utils.py | 2 ++ 
tests/lightning/test_dist_ckpt.py | 18 +++++------------- tests/lightning/test_nemo_resume_from_ckpt.py | 6 ++++-- tests/lightning/test_state_restoration.py | 4 +++- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/lightning/mcore_microbatch_utils.py b/tests/lightning/mcore_microbatch_utils.py index 3f9080826b68..39b3baee446c 100644 --- a/tests/lightning/mcore_microbatch_utils.py +++ b/tests/lightning/mcore_microbatch_utils.py @@ -1,10 +1,12 @@ import contextlib + # @akoumparouli: use a context manager that saves/restores gbs/mbs when using # reconfigure_num_microbatches_calculator to avoid interference between tests. @contextlib.contextmanager def reconfigure_num_microbatches_calculator_manager(*args, **kwargs): import megatron.core.num_microbatches_calculator as mb_calc + # Store current mbs, gbs values if not mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: _mbs = mb_calc.get_micro_batch_size() diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index b0217729113c..d5037f0aa573 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -44,9 +44,7 @@ def _get_last_checkpoint_dir(model: pl.LightningModule, suffix: str = '') -> Pat def get_model_and_data(mbs=2, gbs=2): seq_length = 128 - data = llm.MockDataModule( - seq_length=seq_length, micro_batch_size=mbs, global_batch_size=gbs - ) + data = llm.MockDataModule(seq_length=seq_length, micro_batch_size=mbs, global_batch_size=gbs) config = llm.GPTConfig( num_layers=2, @@ -69,11 +67,8 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): gbs, mbs = 2, 2 model, data = get_model_and_data(mbs, gbs) from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager - with reconfigure_num_microbatches_calculator_manager(0, - None, - gbs, - mbs, - data_parallel_size=1): + + with reconfigure_num_microbatches_calculator_manager(0, None, gbs, mbs, data_parallel_size=1): strategy = _get_strategy() 
@@ -104,11 +99,8 @@ def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path): gbs, mbs = 2, 2 model, data = get_model_and_data(mbs, gbs) from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager - with reconfigure_num_microbatches_calculator_manager(0, - None, - gbs, - mbs, - data_parallel_size=1): + + with reconfigure_num_microbatches_calculator_manager(0, None, gbs, mbs, data_parallel_size=1): sync_ckpt_dir = tmp_path / 'sync_checkpoints' async_ckpt_dir = tmp_path / 'async_checkpoints' diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index 8ec47900ba74..e876e6965000 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -109,6 +109,7 @@ def setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1): ) return data + def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): seq_length = 2048 gpt_config = llm.GPTConfig( @@ -246,11 +247,12 @@ def train(n_steps, resume): data = setup_data(log_dir, n_steps, data_path, gbs=2, mbs=1) # Other tests might have different configs, so need to configure explicitly. 
from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager + with reconfigure_num_microbatches_calculator_manager( 0, None, - 2, # gbs - 1, # mbs + 2, # gbs + 1, # mbs data_parallel_size=1, ): gpt_config, model, optim = setup_model_optim(log_dir, n_steps, data.tokenizer) diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py index e341a3493c7e..076a2f931f57 100644 --- a/tests/lightning/test_state_restoration.py +++ b/tests/lightning/test_state_restoration.py @@ -11,7 +11,7 @@ from nemo.collections.llm.api import train from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning import NeMoLogger, AutoResume +from nemo.lightning import AutoResume, NeMoLogger from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from tests.lightning.mcore_microbatch_utils import reconfigure_num_microbatches_calculator_manager @@ -24,6 +24,7 @@ def teardown(exp_dir=EXP_DIR): import shutil + shutil.rmtree(exp_dir) @@ -86,6 +87,7 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") assert hasattr(self, 'called_on_load_checkpoint') assert self.called_on_load_checkpoint == True, "Expected to have called on_load_checkpoint" + def setup_data(mbs=1, gbs=2, seq_length=2048): tokenizer = get_nmt_tokenizer( "megatron",