Add mcore full TE transformer layer spec #8328

Merged: 26 commits, merged on Feb 23, 2024

Commits
fce8799
Add spec and implement autocast layer
jbaczek Feb 5, 2024
86fb142
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 5, 2024
d2d7eb3
Remove try-catches; these dependencies are mandatory for this file
jbaczek Feb 9, 2024
1c49152
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2024
cbb0caa
Check out this cool try/except clause
jbaczek Feb 12, 2024
e948ed7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 12, 2024
c622399
Remove unused import
jbaczek Feb 12, 2024
828ebf8
Add import tests to Jenkinsfile
jbaczek Feb 13, 2024
b2ea671
Move import tests to Jenkins and remove code that is developed only f…
jbaczek Feb 13, 2024
6cff157
Make test robust to faulty base configs
jbaczek Feb 14, 2024
edd8ade
Use proper GPT implementation in the test
jbaczek Feb 14, 2024
46ae339
Update nemo/collections/nlp/models/language_modeling/megatron/gpt_ful…
jbaczek Feb 15, 2024
eea1fd3
Update nemo/collections/nlp/models/language_modeling/megatron/gpt_ful…
jbaczek Feb 15, 2024
b0bc935
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2024
4cb0fe4
Update nemo/collections/nlp/models/language_modeling/megatron/gpt_ful…
jbaczek Feb 19, 2024
e2352dd
Update nemo/collections/nlp/models/language_modeling/megatron/gpt_ful…
jbaczek Feb 19, 2024
29273b0
Add TE knobs to the copy of AutocastTransformerLayer
jbaczek Feb 19, 2024
7b7feef
Add TE knobs to the copy of AutocastTransformerLayer
jbaczek Feb 19, 2024
13d60df
Add dummy parameter to accommodate the changes in mcore
jbaczek Feb 19, 2024
06cf98a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 19, 2024
f965ff1
Update mcore to 0.5.0 in Jenkins pipeline
jbaczek Feb 21, 2024
e828713
Bump mcore commit. This is a commit from ToT, not any release.
jbaczek Feb 21, 2024
c69c8f8
Remove from the test config an option that is incompatible with bias_act…
jbaczek Feb 21, 2024
bf5080c
Bump TE version in CI to 1.4
jbaczek Feb 21, 2024
c333923
Update test
jbaczek Feb 22, 2024
b3d72f9
Change precision for the test - current runners don't support bf16
jbaczek Feb 22, 2024
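
For orientation, the "spec" in the PR title is a megatron-core layer spec that swaps in Transformer Engine's full TransformerLayer. The sketch below is a hypothetical illustration of that idea using megatron.core's ModuleSpec; it is not the implementation added by this PR, and the function name is an assumption.

```python
# Hypothetical sketch of a "full TE" transformer layer spec expressed with
# megatron.core's ModuleSpec. The spec actually added by this PR lives in NeMo and
# wraps an autocast copy of the TE layer; the function name and the choice to point
# `module` directly at te.TransformerLayer are illustrative assumptions.
from megatron.core.transformer.spec_utils import ModuleSpec
import transformer_engine.pytorch as te


def get_full_te_layer_spec() -> ModuleSpec:
    # Use Transformer Engine's whole TransformerLayer as the layer module instead of
    # assembling attention/MLP submodules from individual mcore components.
    return ModuleSpec(module=te.TransformerLayer)
```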
107 changes: 0 additions & 107 deletions .github/workflows/import-test.yml

This file was deleted.

95 changes: 93 additions & 2 deletions Jenkinsfile
@@ -68,7 +68,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
@@ -91,7 +91,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \
git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \
pip install .'
}
}
@@ -115,6 +115,13 @@ pipeline {
sh 'python -c "import nemo.collections.tts as nemo_tts"'
}
}
stage('Import Checks'){
steps {
sh 'python tests/core_ptl/check_imports.py --domain "asr"'
sh 'python tests/core_ptl/check_imports.py --domain "nlp"'
sh 'python tests/core_ptl/check_imports.py --domain "tts"'
}
}
stage('L0: Unit Tests GPU') {
steps {
sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads'
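
The new "Import Checks" stage above runs tests/core_ptl/check_imports.py once per domain. As a rough illustration only (the actual script added to the repository may be structured differently), a minimal per-domain import smoke test could look like this:

```python
# Hypothetical minimal version of a per-domain import smoke test, shown only to
# illustrate what the Jenkins stage invokes; the real tests/core_ptl/check_imports.py
# added in this PR may differ.
import argparse
import importlib

DOMAINS = {
    "asr": "nemo.collections.asr",
    "nlp": "nemo.collections.nlp",
    "tts": "nemo.collections.tts",
}


def main() -> None:
    parser = argparse.ArgumentParser(description="Import a NeMo collection and fail fast on errors.")
    parser.add_argument("--domain", choices=sorted(DOMAINS), required=True)
    args = parser.parse_args()
    # Importing the collection surfaces missing or broken dependencies early,
    # without having to launch a full training job.
    module = importlib.import_module(DOMAINS[args.domain])
    print(f"Successfully imported {module.__name__}")


if __name__ == "__main__":
    main()
```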
@@ -3470,6 +3477,90 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/token_classification_results"
}
}
stage('L2: Megatron GPT Pretraining and Resume Training TETransformerLayerTP=2') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.name=megatron_gpt_full_te_layer_autocast \
model.mcore_gpt=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=layernorm1p \
model.bias_activation_fusion=True \
model.bias_dropout_add_fusion=True \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=null \
model.activations_checkpoint_granularity=null \
model.activations_checkpoint_num_layers=null \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.mcore_gpt=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=layernorm1p \
model.bias_activation_fusion=True \
model.bias_dropout_add_fusion=True \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=null \
model.activations_checkpoint_granularity=null \
model.activations_checkpoint_num_layers=null \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings"
}
}
// @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
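
The two new test stages exercise ++model.name=megatron_gpt_full_te_layer_autocast with trainer.precision=16. As a loose illustration of the "autocast" idea referenced in the commit messages, and not the actual NeMo implementation, a Transformer Engine TransformerLayer can be wrapped so its forward pass runs under torch.autocast:

```python
# Loose illustration of the "autocast layer" idea from the commit messages: a copy of
# Transformer Engine's TransformerLayer whose forward runs under torch.autocast so the
# layer matches trainer.precision=16/bf16-mixed. The real AutocastTransformerLayer in
# NeMo carries additional TE knobs; class and argument names here are assumptions.
import torch
import transformer_engine.pytorch as te


class AutocastTransformerLayer(te.TransformerLayer):
    def __init__(self, *args, autocast_dtype: torch.dtype = torch.float16, **kwargs):
        super().__init__(*args, **kwargs)
        self.autocast_dtype = autocast_dtype

    def forward(self, *args, **kwargs):
        # Execute the underlying TE transformer layer in reduced precision on CUDA.
        with torch.autocast(device_type="cuda", dtype=self.autocast_dtype):
            return super().forward(*args, **kwargs)
```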