Parametrize FPS group #2891
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2020-2021, NVIDIA CORPORATION. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# Workflow identity, trigger and concurrency settings.
name: 'CICD NeMo'

# Triggered when a label is added to a pull request that targets `main` or a
# branch matching `r**`.
on:
  pull_request:
    branches:
      - main
      - 'r**'
    types:
      - labeled

# One concurrency group per pull request (falling back to the ref when there is
# no PR number); a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
# Smoke test: prints the current user and the `nvidia-smi` report on the
# runner. Only runs when the label that was just added is "Run CICD".
gpu-test:
  if: github.event.label.name == 'Run CICD'
  runs-on: self-hosted-azure
  steps:
    - name: Run nvidia-smi test
      run: |
        whoami
        nvidia-smi
# Housekeeping on the builder: prunes Docker containers and images that match
# the `until=24h` filter. Only runs for the "Run CICD" label.
cicd-cluster-clean:
  if: github.event.label.name == 'Run CICD'
  runs-on: self-hosted-azure-builder
  steps:
    - name: Clean server from old files
      run: |
        docker container prune --filter "until=24h" --force
        docker image prune -a --filter "until=24h" --force
# Builds the CI image from Dockerfile.ci, pushes it under a per-run tag and as
# `latest`, then runs a handful of sanity checks inside the freshly built image.
cicd-test-container-setup:
  if: github.event.label.name == 'Run CICD'
  needs:
    - cicd-cluster-clean
  runs-on: self-hosted-azure-builder
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        # Check out into a directory named after the run id.
        path: ${{ github.run_id }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
      with:
        # We use `docker` driver as this speeds things up for
        # trivial (non-multi-stage) builds.
        driver: docker

    - name: Build and push
      uses: docker/build-push-action@v5
      with:
        file: Dockerfile.ci
        push: true
        cache-from: nemoci.azurecr.io/nemo_container:latest
        cache-to: type=inline
        # Two tags: one unique to this run, one moving `latest`.
        tags: |
          nemoci.azurecr.io/nemo_container_${{ github.run_id }}
          nemoci.azurecr.io/nemo_container:latest

    # The inner script ends with `exit 0`: these checks are informational and
    # are not meant to fail the job.
    - name: Run some checks
      run: |
        docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\
        # PyTorch Lightning version
        python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
        # PyTorch Lightning DDP Checks
        CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
        # Basic Import Checks
        python -c "import nemo.collections.asr as nemo_asr"
        python -c "import nemo.collections.nlp as nemo_nlp"
        python -c "import nemo.collections.tts as nemo_tts"
        python setup.py style
        python tests/check_copyright_header.py --dir .
        # These checks are not crucial
        exit 0
        '
        ### \'\'
# Runs pytest (excluding tests marked `pleasefixme`) through the shared test
# template on the GPU runner; flagged optional via IS_OPTIONAL.
L0_Unit_Tests_GPU:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    TIMEOUT: 60
    SCRIPT: |
      NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
    IS_OPTIONAL: true
# CPU variant of the unit tests: CUDA_VISIBLE_DEVICES is emptied and pytest is
# invoked with `--cpu` and `--relax_numba_compat`.
L0_Unit_Tests_CPU:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-cpu
    TIMEOUT: 60
    SCRIPT: |
      CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
# Invokes the `tests.setup` module with /home/TestData/nlp as its save
# directory.
L0_Setup_Test_Data_And_Models:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python -m tests.setup --save_dir /home/TestData/nlp
## - name: L2: Multimodal Imagen Train | |
# L2: Community LLM Checkpoints tests | |
# Converts the tiny HF Llama CI checkpoint to .nemo on GPU 0, then removes the
# `model_weights` directory. The resulting llama_ci.nemo is left in place.
# NOTE(review): llama_ci.nemo is also the restore path of the PTQ/QAT jobs in
# this file, which do not declare a dependency on this job — confirm that no
# ordering is required.
L2_Community_LLM_Checkpoints_tests_Llama:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
      --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
      --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
      --precision=16
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_llama/model_weights
# Converts the HF Llama3 CI checkpoint to .nemo on GPU 0; the after-script
# deletes both the produced .nemo file and the `model_weights` directory.
L2_Community_LLM_Checkpoints_tests_Llama3:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
      --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \
      --output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \
      --precision=16
    AFTER_SCRIPT: |
      rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo
      rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights
# Converts the HF StarCoder CI checkpoint into a per-run output directory
# (named after the run id) and removes the generated artefacts afterwards.
L2_Community_LLM_Checkpoints_tests_StarCoder:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }};
      python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \
      --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \
      --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo;
      rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/
      rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights
# Converts the HF Falcon CI checkpoint to .nemo and cleans up afterwards.
#
# The removal of falcon_ci.nemo lives in AFTER_SCRIPT (as the Llama3 job in
# this file does) rather than at the end of SCRIPT: as the last command of
# SCRIPT, `rm -f` would determine the script's exit status and could hide a
# failed conversion, and it would be skipped if the shell aborted early.
# NOTE(review): assumes _test_template.yml runs AFTER_SCRIPT regardless of the
# SCRIPT outcome — confirm against the template.
L2_Community_LLM_Checkpoints_tests_Falcon:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \
      --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \
      --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
    AFTER_SCRIPT: |
      rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
      rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights
# this test is using a 7B model which is too large for GitHub CI | |
# replace the model in this test with a toy model or move the test | |
# to the nightly CI | |
# OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ | |
# --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ | |
# --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo | |
# rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo | |
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# if: "failure()" | |
# Runs megatron_gpt_ptq.py against llama_ci.nemo with quantization disabled
# (`quantization.algorithm=null`), then removes the export directory.
L2_PTQ_Llama2_Export_Only:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_ptq.py \
      model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
      quantization.algorithm=null \
      export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
# FP8 post-training quantization of llama_ci.nemo with tensor parallel size 2
# on two devices; the exported .qnemo is deleted afterwards.
L2_PTQ_Llama2_FP8:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_ptq.py \
      model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
      model.tensor_model_parallel_size=2 \
      trainer.devices=2 \
      quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
      quantization.algorithm=fp8 \
      quantization.num_calib_size=8 \
      inference.batch_size=2 \
      export.inference_tensor_parallel=2 \
      export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
# INT8 SmoothQuant (`int8_sq`) post-training quantization of llama_ci.nemo;
# the exported .qnemo is deleted afterwards.
L2_PTQ_Llama2_INT8_SQ:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_ptq.py \
      model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
      quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
      quantization.algorithm=int8_sq \
      quantization.num_calib_size=8 \
      inference.batch_size=2 \
      export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
# TODO: investigate int4_awq stuck issues and restore the test | |
#L2_PTQ_Llama2_INT4_AWQ: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure | |
# timeout-minutes: 10 | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python examples/nlp/language_modeling/megatron_gpt_ptq.py \ | |
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ | |
# model.tensor_model_parallel_size=1 \ | |
# trainer.devices=1 \ | |
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ | |
# quantization.algorithm=int4_awq \ | |
# quantization.num_calib_size=8 \ | |
# inference.batch_size=2 \ | |
# export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo | |
# | |
# rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo | |
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# if: "failure()" | |
# INT4 quantization-aware training smoke test (4 steps, one device) starting
# from llama_ci.nemo.
#
# Uses the shared _test_template.yml like the other active jobs in this file
# instead of a hand-written runs-on/container/steps definition. The 10-minute
# limit is passed through TIMEOUT, and removal of the results directory is in
# AFTER_SCRIPT so that it is not chained behind the training command.
# NOTE(review): assumes the template provides the same container image,
# GPU/shm options, TestData volume and cancel-on-failure handling that this
# job previously declared inline — confirm against _test_template.yml.
L2_QAT_Llama2_INT4:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    TIMEOUT: 10
    SCRIPT: |
      python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
      quantization.algorithm=int4 \
      quantization.num_calib_size=8 \
      trainer.devices=1 \
      trainer.num_nodes=1 \
      trainer.max_steps=4 \
      trainer.val_check_interval=4 \
      +trainer.limit_val_batches=2 \
      exp_manager.explicit_log_dir=llama2_qat_results \
      model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
      model.tensor_model_parallel_size=1 \
      model.pipeline_model_parallel_size=1 \
      model.global_batch_size=2 \
      model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
      model.data.train_ds.concat_sampling_probabilities=[1.0] \
      model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
    AFTER_SCRIPT: |
      rm -rf llama2_qat_results
# L2: ASR dev run | |
# Fast-dev-run of the CTC speech-to-text example on the AN4 manifests using a
# single GPU; the experiment directory is removed afterwards.
ASR_dev_run_Speech_to_Text:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc.py \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_results
# Fast-dev-run of the CTC-BPE example with the CitriNet config and the AN4
# WPE tokenizer on a single GPU.
ASR_dev_run_Speech_to_Text_WPE_-_CitriNet:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
      --config-path="../conf/citrinet/" --config-name="config_bpe" \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
      model.tokenizer.type="wpe" \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_wpe_results
# Fast-dev-run of the speech pre-training example with the `citrinet_ssl_ci`
# config on a single GPU.
ASR_dev_run_Speech_Pre-training_-_CitriNet:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/speech_pretraining/speech_pre_training.py \
      --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_pre_training_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_pre_training_results
# Fast-dev-run of speech_to_text_finetune.py, initialised from the
# FastConformer transducer .nemo checkpoint, without updating the tokenizer.
ASR_dev_run_Speech_To_Text_Finetuning:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/speech_to_text_finetune.py \
      --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
      model.tokenizer.update_tokenizer=False \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_finetuning_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_finetuning_results
# Optional job: fast-dev-run of speech_to_text_finetune.py with the HF
# finetune config. The train and validation `hf_data_cfg` sections are deleted
# and re-added to point at the streaming `librispeech_asr` "test.clean" split,
# and `model.test_ds` is removed.
OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |-
      python examples/asr/speech_to_text_finetune.py \
      --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
      ~model.train_ds.hf_data_cfg \
      model.train_ds.num_workers=1 \
      model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
      model.train_ds.streaming=true \
      +model.train_ds.hf_data_cfg.path="librispeech_asr" \
      +model.train_ds.hf_data_cfg.name=null \
      +model.train_ds.hf_data_cfg.split="test.clean" \
      +model.train_ds.hf_data_cfg.streaming=true \
      ~model.validation_ds.hf_data_cfg \
      model.validation_ds.streaming=true \
      +model.validation_ds.hf_data_cfg.path="librispeech_asr" \
      +model.validation_ds.hf_data_cfg.name=null \
      +model.validation_ds.hf_data_cfg.split="test.clean" \
      +model.validation_ds.hf_data_cfg.streaming=true \
      ~model.test_ds \
      init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
      model.tokenizer.update_tokenizer=False \
      model.optim.sched.warmup_steps=0 \
      +model.optim.sched.max_steps=3 \
      trainer.max_epochs=null \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_finetuning_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_finetuning_results
    IS_OPTIONAL: true
# Fast-dev-run of the CTC-BPE example with the Conformer config, the AN4 WPE
# tokenizer and batch size 4 on a single GPU.
ASR_dev_run_Speech_to_Text_WPE_-_Conformer:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
      --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
      model.tokenizer.type="wpe" \
      model.train_ds.batch_size=4 \
      model.validation_ds.batch_size=4 \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_wpe_conformer_results
# L2: ASR dev run - part two | |
# Fast-dev-run of the CTC-BPE example with the Squeezeformer config
# (`model.encoder.d_model=144`), the AN4 WPE tokenizer and batch size 4.
ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
      --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
      model.tokenizer.type="wpe" \
      model.encoder.d_model=144 \
      model.train_ds.batch_size=4 \
      model.validation_ds.batch_size=4 \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results
# Fast-dev-run of the CTC speech-to-text example on two devices with
# `exp_manager.ema.enable=True`.
L2_Speech_to_Text_EMA:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc.py \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      trainer.devices=2 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      +exp_manager.ema.enable=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_results
# L2_Speech_to_Text_AED: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure-gpus-1 | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python examples/asr/speech_multitask/speech_to_text_aed.py \ | |
# model.prompt_format=canary \ | |
# model.model_defaults.asr_enc_hidden=256 \ | |
# model.model_defaults.lm_dec_hidden=256 \ | |
# model.encoder.n_layers=12 \ | |
# model.transf_encoder.num_layers=0 \ | |
# model.transf_decoder.config_dict.num_layers=12 \ | |
# model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ | |
# ++model.train_ds.is_tarred=false \ | |
# model.train_ds.batch_duration=60 \ | |
# +model.train_ds.text_field="answer" \ | |
# +model.train_ds.lang_field="target_lang" \ | |
# model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ | |
# +model.validation_ds.text_field="answer" \ | |
# +model.validation_ds.lang_field="target_lang" \ | |
# model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ | |
# +model.test_ds.text_field="answer" \ | |
# +model.test_ds.lang_field="target_lang" \ | |
# model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ | |
# model.tokenizer.langs.spl_tokens.type="bpe" \ | |
# model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ | |
# model.tokenizer.langs.en.type=bpe \ | |
# ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ | |
# ++model.tokenizer.langs.es.type=bpe \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.use_distributed_sampler=false \ | |
# +trainer.fast_dev_run=True \ | |
# exp_manager.exp_dir=examples/asr/speech_to_text_aed_results | |
# rm -rf examples/asr/speech_to_text_aed_results | |
# L2: Speaker dev run | |
# Fast-dev-run of the speaker recognition example on the AN4 speaker manifests
# with two decoder classes, on a single GPU.
L2_Speaker_dev_run_Speaker_Recognition:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/speaker_tasks/recognition/speaker_reco.py \
      model.train_ds.batch_size=10 \
      model.validation_ds.batch_size=2 \
      model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \
      model.decoder.num_classes=2 \
      trainer.max_epochs=10 \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results
    AFTER_SCRIPT: |
      rm -rf examples/speaker_tasks/recognition/speaker_recognition_results
# Fast-dev-run of the multiscale diarization decoder training example with
# `titanet_large` speaker embeddings; embeddings and experiment output share
# one results directory, which is removed afterwards.
L2_Speaker_dev_run_Speaker_Diarization:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \
      model.diarizer.speaker_embeddings.model_path=titanet_large \
      model.train_ds.batch_size=5 \
      model.validation_ds.batch_size=5 \
      model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
      model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
      model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results
    AFTER_SCRIPT: |
      rm -rf examples/speaker_tasks/diarization/speaker_diarization_results
# Fast-dev-run of the speech-to-label example on the speech_commands
# manifests. The preprocessor target is switched to
# AudioToMelSpectrogramPreprocessor and six preprocessor keys are deleted
# from the config via `~` overrides.
L2_Speaker_dev_run_Speech_to_Label:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/speech_classification/speech_to_label.py \
      model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
      model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
      model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
      ~model.preprocessor.window_size \
      ~model.preprocessor.window_stride \
      ~model.preprocessor.window \
      ~model.preprocessor.n_mels \
      ~model.preprocessor.n_mfcc \
      ~model.preprocessor.n_fft \
      exp_manager.exp_dir=examples/asr/speech_to_label_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_label_results
# Offline diarization with ASR inference on the AN4 diarizer manifest, using
# QuartzNet15x5Base-En for ASR with ASR-based VAD enabled.
L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \
      diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
      diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
      diarizer.speaker_embeddings.parameters.save_embeddings=True \
      diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \
      diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \
      diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \
      diarizer.asr.model_path=QuartzNet15x5Base-En \
      diarizer.asr.parameters.asr_based_vad=True \
      diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results
    AFTER_SCRIPT: |
      rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results
# Offline clustering-diarizer inference on the AN4 diarizer manifest with the
# MatchboxNet VAD checkpoint and a single embedding scale.
L2_Speaker_dev_run_Clustering_Diarizer_Inference:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \
      diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
      diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
      diarizer.speaker_embeddings.parameters.save_embeddings=True \
      diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \
      diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \
      diarizer.speaker_embeddings.parameters.multiscale_weights=null \
      diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
      diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results
    AFTER_SCRIPT: |
      rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results
# Neural (MSDD) diarizer inference on the AN4 diarizer manifest using the
# `diar_msdd_telephonic` and MatchboxNet VAD checkpoints.
L2_Speaker_dev_run_Neural_Diarizer_Inference:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \
      diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
      diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \
      diarizer.speaker_embeddings.parameters.save_embeddings=True \
      diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
      diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results
    AFTER_SCRIPT: |
      rm -rf examples/speaker_tasks/diarization/neural_diarizer_results
# Runs the multispeaker data simulator with a fixed seed (42) for two sessions
# of length 60, writing to ./test_simulator, which is removed afterwards.
L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python tools/speech_data_simulator/multispeaker_simulator.py \
      --config-path=conf --config-name=data_simulator.yaml \
      data_simulator.random_seed=42 \
      data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \
      data_simulator.outputs.output_dir=./test_simulator \
      data_simulator.session_config.num_sessions=2 \
      data_simulator.session_config.session_length=60
    AFTER_SCRIPT: |
      rm -rf ./test_simulator
# L2: ASR Multi-dataloader dev run | |
# One-step CTC speech-to-text run whose validation manifest is a two-element
# list (the same AN4 validation manifest twice).
L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_ctc/speech_to_text_ctc.py \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      trainer.max_epochs=1 \
      trainer.max_steps=1 \
      +trainer.num_sanity_val_steps=1 \
      exp_manager.exp_dir=examples/asr/speech_to_text_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_results
# One-step speech-to-label run whose validation manifest is a two-element list
# (the same speech_commands test manifest twice). The preprocessor target is
# switched to AudioToMelSpectrogramPreprocessor and six preprocessor keys are
# deleted via `~` overrides.
L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/speech_classification/speech_to_label.py \
      model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
      model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      trainer.max_epochs=1 \
      trainer.max_steps=1 \
      +trainer.num_sanity_val_steps=1 \
      model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
      ~model.preprocessor.window_size \
      ~model.preprocessor.window_stride \
      ~model.preprocessor.window \
      ~model.preprocessor.n_mels \
      ~model.preprocessor.n_mfcc \
      ~model.preprocessor.n_fft \
      exp_manager.exp_dir=examples/asr/speech_to_label_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_label_results
# L2: ASR Adapters | |
# Fast-dev-run of adapter training on `stt_en_conformer_ctc_small` with a
# linear adapter named "an4" (in_features=176).
L2_ASR_Adapters_Linear_Adapters:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_adapters/train_asr_adapter.py \
      model.pretrained_model="stt_en_conformer_ctc_small" \
      model.adapter.adapter_name="an4" \
      model.adapter.linear.in_features=176 \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      trainer.max_steps=5 \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_adapters_results
# Fast-dev-run of adapter training on `stt_en_conformer_ctc_small` with a
# `tiny_attn` adapter named "encoder:an4" (n_feat=176).
L2_ASR_Adapters_RelPos_MHA_Adapters:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure-gpus-1
    SCRIPT: |
      python examples/asr/asr_adapters/train_asr_adapter.py \
      model.pretrained_model="stt_en_conformer_ctc_small" \
      model.adapter.adapter_name="encoder:an4" \
      model.adapter.adapter_type="tiny_attn" \
      model.adapter.tiny_attn.n_feat=176 \
      model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
      model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
      trainer.max_steps=5 \
      trainer.devices=1 \
      trainer.accelerator="gpu" \
      +trainer.fast_dev_run=True \
      exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results
    AFTER_SCRIPT: |
      rm -rf examples/asr/speech_to_text_adapters_mha_results
# L2: Speech Transcription | |
# Transcribes the AN4 test subset with QuartzNet15x5Base-En (amp enabled) into
# stt_test_res.json, which is removed afterwards.
L2_Speech_Transcription_Speech_to_Text_Transcribe:
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/asr/transcribe_speech.py \
      pretrained_name="QuartzNet15x5Base-En" \
      audio_dir="/home/TestData/an4_transcribe/test_subset/" \
      output_filename="stt_test_res.json" \
      amp=true
    AFTER_SCRIPT: |
      rm -rf stt_test_res.json
# L2: Transducer alignment | |
# Job: runs the pytest module tests/collections/asr/decoding/rnnt_alignments_check.py
# with --durations=-1.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_Transducer_alignment_Running_pytest: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 | |
# L2: Segmentation Tool | |
# Job: CTC segmentation check for the English data set.
# SCRIPT changes into tools/ctc_segmentation, captures a timestamp in TIME, runs
# run_segmentation.sh with the "stt_en_citrinet_512_gamma_0_25" model into
# /home/TestData/ctc_segmentation/eng/output${TIME}, and then compares the
# produced nv_test_segments.txt with the reference eng_valid_segments_1.7.txt
# using verify_alignment.py. All steps are chained with &&.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup.
# NOTE(review): AFTER_SCRIPT expands ${TIME}, which is only assigned inside
# SCRIPT. If the template runs AFTER_SCRIPT in a separate shell, TIME will be
# empty there and the timestamped output directory will not be removed -
# confirm against _test_template.yml.
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd tools/ctc_segmentation && \ | |
TIME=`date +"%Y-%m-%d-%T"` && \ | |
/bin/bash run_segmentation.sh \ | |
--MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ | |
--DATA_DIR=/home/TestData/ctc_segmentation/eng \ | |
--OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ | |
--LANGUAGE=en \ | |
--USE_NEMO_NORMALIZATION="TRUE" && \ | |
python /home/TestData/ctc_segmentation/verify_alignment.py \ | |
-r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ | |
-g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt; | |
AFTER_SCRIPT: | | |
rm -rf /home/TestData/ctc_segmentation/eng/output${TIME} | |
# Job: CTC segmentation check for the Russian data set.
# SCRIPT changes into tools/ctc_segmentation, captures a timestamp in TIME, runs
# run_segmentation.sh with the QuartzNet15x5-Ru .nemo checkpoint into
# /home/TestData/ctc_segmentation/ru/output${TIME} (splitting additionally on
# ";"), and then compares the produced ru_segments.txt with the reference
# valid_ru_segments_1.7.txt using verify_alignment.py.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup.
# The final rm runs after ";" (so regardless of the verification result) and
# deletes this job's own output directory in the ru tree, i.e. the same path
# that is passed as --OUTPUT_DIR.
# NOTE(review): because rm is the last command, the script's exit status may be
# that of rm rather than verify_alignment.py - confirm the template runs the
# script with errexit enabled.
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd tools/ctc_segmentation && \ | |
TIME=`date +"%Y-%m-%d-%T"` && \ | |
/bin/bash run_segmentation.sh \ | |
--MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ | |
--DATA_DIR=/home/TestData/ctc_segmentation/ru \ | |
--OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \ | |
--LANGUAGE=ru \ | |
--ADDITIONAL_SPLIT_SYMBOLS=";" && \ | |
python /home/TestData/ctc_segmentation/verify_alignment.py \ | |
-r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ | |
-g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt; | |
rm -rf /home/TestData/ctc_segmentation/ru/output${TIME} | |
# L2: G2P Models | |
# Job: G2P Conformer-CTC training, testing and inference.
# SCRIPT changes into examples/tts/g2p, derives a timestamped output directory
# name (OUTPUT_DIR_CONFORMER), runs g2p_train_and_evaluate.py with the
# g2p_conformer_ctc config for one epoch with do_training/do_testing enabled,
# and then (chained with &&) runs g2p_inference.py on the .nemo checkpoint
# expected under ${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints.
# That checkpoint path relies on +exp_manager.use_datetime_version=False and
# +exp_manager.version=test being passed as two separate overrides, so every
# continuation backslash is preceded by a space; the arguments then stay
# separated regardless of how the following line is indented.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd examples/tts/g2p && \ | |
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ | |
python g2p_train_and_evaluate.py \ | |
train_manifest=/home/TestData/g2p/g2p.json \ | |
validation_manifest=/home/TestData/g2p/g2p.json \ | |
model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ | |
model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ | |
trainer.max_epochs=1 \ | |
model.max_source_len=64 \ | |
trainer.devices=1 \ | |
do_training=True \ | |
do_testing=True \ | |
exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ | |
+exp_manager.use_datetime_version=False \ | |
+exp_manager.version=test \ | |
--config-name=g2p_conformer_ctc && \ | |
python g2p_inference.py \ | |
pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ | |
manifest_filepath=/home/TestData/g2p/g2p.json \ | |
phoneme_field=text | |
# TODO: pleasefixme @redoctopus | |
# - name: ByT5G2P training, evaluation and inference | |
# run: | | |
# cd examples/tts/g2p && \ | |
# TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ | |
# python g2p_train_and_evaluate.py \ | |
# train_manifest=/home/TestData/g2p/g2p.json \ | |
# validation_manifest=/home/TestData/g2p/g2p.json \ | |
# model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ | |
# trainer.max_epochs=1 \ | |
# model.max_source_len=64 \ | |
# trainer.devices=1 \ | |
# do_training=True \ | |
# do_testing=True \ | |
# exp_manager.exp_dir=${OUTPUT_DIR_T5} \ | |
# +exp_manager.use_datetime_version=False\ | |
# +exp_manager.version=test && \ | |
# python g2p_inference.py \ | |
# pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ | |
# manifest_filepath=/home/TestData/g2p/g2p.json \ | |
# phoneme_field=text | |
# } | |
# } | |
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# if: "failure()" | |
# Job: heteronym classification training, testing and inference.
# SCRIPT changes into examples/tts/g2p, derives a timestamped OUTPUT_DIR, runs
# g2p_heteronym_classification_train_and_evaluate.py for one epoch with
# do_training/do_testing enabled, and then (chained with &&) runs
# g2p_heteronym_classification_inference.py on the .nemo checkpoint expected
# under ${OUTPUT_DIR}/HeteronymClassification/test/checkpoints, writing
# preds.json.
# That checkpoint path relies on +exp_manager.use_datetime_version=False and
# +exp_manager.version=test being passed as two separate overrides, so every
# continuation backslash is preceded by a space; the arguments then stay
# separated regardless of how the following line is indented.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd examples/tts/g2p && \ | |
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ | |
python g2p_heteronym_classification_train_and_evaluate.py \ | |
train_manifest=/home/TestData/g2p/manifest.json \ | |
validation_manifest=/home/TestData/g2p/manifest.json \ | |
test_manifest=/home/TestData/g2p/manifest.json \ | |
model.wordids=/home/TestData/g2p/wordids.tsv \ | |
trainer.max_epochs=1 \ | |
model.max_seq_length=64 \ | |
do_training=True \ | |
do_testing=True \ | |
exp_manager.exp_dir=${OUTPUT_DIR} \ | |
+exp_manager.use_datetime_version=False \ | |
+exp_manager.version=test && \ | |
python g2p_heteronym_classification_inference.py \ | |
manifest=/home/TestData/g2p/manifest.json \ | |
pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ | |
output_manifest=preds.json | |
# L2: Duplex Text Normalization | |
# Job: duplex text normalization training on a tarred data set.
# SCRIPT changes into examples/nlp/duplex_text_normalization and runs
# duplex_text_normalization_train.py with mode=tn, lang=en, tagger training
# disabled (tagger_model.do_training=false) and a t5-small decoder. The train
# data comes from the tarred metadata file under
# /home/TestData/nlp/duplex_text_norm/tarred_small; validation and test read
# small_test.tsv. Caching and decoder checkpoint creation are switched off.
# The decoder trainer is given devices [0,1] on the gpu accelerator with
# +decoder_trainer.fast_dev_run=true.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_Duplex_Text_Normalization_with_Tarred_dataset: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd examples/nlp/duplex_text_normalization && \ | |
python duplex_text_normalization_train.py \ | |
data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ | |
mode=tn \ | |
lang=en \ | |
tagger_model.do_training=false \ | |
decoder_model.transformer=t5-small \ | |
data.validation_ds.batch_size=2 \ | |
data.train_ds.use_cache=false \ | |
data.validation_ds.use_cache=false \ | |
data.test_ds.batch_size=2 \ | |
data.train_ds.decoder_data_augmentation=false \ | |
data.train_ds.num_workers=2 \ | |
decoder_trainer.devices=[0,1] \ | |
decoder_trainer.accelerator="gpu" \ | |
data.train_ds.use_tarred_dataset=true \ | |
+decoder_trainer.fast_dev_run=true \ | |
decoder_exp_manager.create_checkpoint_callback=false \ | |
data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ | |
data.test_ds.use_cache=false \ | |
data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv | |
# L2: Intent and Slot Classification Tasks | |
# Job: intent and slot classification on the retail data set.
# SCRIPT changes into examples/nlp/intent_slot_classification and runs
# intent_slot_classification.py on /home/TestData/nlp/retail, using the "dev"
# prefix for both validation and test data, one GPU device and
# +trainer.fast_dev_run=true.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. AFTER_SCRIPT removes the "checkpoints" directory
# named by exp_manager.exp_dir.
# NOTE(review): exp_dir is relative and SCRIPT does a cd first; whether
# AFTER_SCRIPT runs from the same working directory is not visible here.
L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/intent_slot_classification && \ | |
python intent_slot_classification.py \ | |
model.data_dir=/home/TestData/nlp/retail \ | |
model.validation_ds.prefix=dev \ | |
model.test_ds.prefix=dev \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
exp_manager.exp_dir=checkpoints | |
AFTER_SCRIPT: | | |
rm -rf checkpoints | |
# Job: multi-label intent and slot classification on the new_multiatis data.
# SCRIPT changes into examples/nlp/intent_slot_classification and runs
# multi_label_intent_slot_classification.py on /home/TestData/nlp/new_multiatis,
# using the "dev" prefix for both validation and test data, trainer.devices=1
# and +trainer.fast_dev_run=true. No trainer.accelerator override is passed.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. AFTER_SCRIPT removes the "checkpoints2" directory
# named by exp_manager.exp_dir.
# NOTE(review): exp_dir is relative and SCRIPT does a cd first; whether
# AFTER_SCRIPT runs from the same working directory is not visible here.
L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/intent_slot_classification && \ | |
python multi_label_intent_slot_classification.py \ | |
model.data_dir=/home/TestData/nlp/new_multiatis \ | |
model.validation_ds.prefix=dev \ | |
model.test_ds.prefix=dev \ | |
trainer.devices=1 \ | |
+trainer.fast_dev_run=true \ | |
exp_manager.exp_dir=checkpoints2 | |
AFTER_SCRIPT: | | |
rm -rf checkpoints2 | |
# TODO: add when megatron-bert is supported again | |
# stage('L2: Model Parallel Size 2 Megatron Text Classification') { | |
# when { | |
# anyOf{ | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# steps{ | |
# cd examples/nlp/text_classification && \ | |
# python text_classification_with_bert.py \ | |
# trainer.devices=[0,1] \ | |
# trainer.accelerator="gpu" \ | |
# trainer.num_nodes=1 \ | |
# trainer.precision=16 \ | |
# trainer.gradient_clip_val=1.0 \ | |
# +trainer.fast_dev_run=true \ | |
# model.dataset.num_classes=6 \ | |
# model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ | |
# model.train_ds.batch_size=4 \ | |
# model.language_model.pretrained_model_name=megatron-bert-uncased \ | |
# model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ | |
# model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ | |
# model.nemo_path=null \ | |
# ~model.infer_samples \ | |
# exp_manager=null | |
# } | |
# } | |
# stage('L2: Model Parallel Size 2 Megatron Autoresume') { | |
# when { | |
# anyOf{ | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# steps{ | |
# cd examples/nlp/text_classification && \ | |
# python text_classification_with_bert.py \ | |
# trainer.devices=[0,1] \ | |
# trainer.accelerator="gpu" \ | |
# trainer.num_nodes=1 \ | |
# trainer.precision=16 \ | |
# trainer.gradient_clip_val=1.0 \ | |
# trainer.max_epochs=1 \ | |
# +trainer.fast_dev_run=true \ | |
# model.dataset.num_classes=6 \ | |
# model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ | |
# model.train_ds.batch_size=4 \ | |
# model.language_model.pretrained_model_name=megatron-bert-uncased \ | |
# model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ | |
# model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ | |
# model.nemo_path=null \ | |
# ~model.infer_samples \ | |
# +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ | |
# +exp_manager.resume_if_exists=true | |
# } | |
# } | |
# stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { | |
# when { | |
# anyOf{ | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# steps{ | |
# cd examples/nlp/text_classification && \ | |
# python model_parallel_text_classification_evaluation.py \ | |
# trainer.devices=[0,1] \ | |
# trainer.accelerator="gpu" \ | |
# trainer.num_nodes=1 \ | |
# model.dataset.num_classes=6 \ | |
# model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ | |
# model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ | |
# exp_manager=null | |
# } | |
# } | |
# stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { | |
# when { | |
# anyOf{ | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# steps{ | |
# cd examples/nlp/token_classification && \ | |
# python token_classification_train.py \ | |
# pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ | |
# model.dataset.data_dir=/home/TestData/nlp/ner/ \ | |
# model.train_ds.batch_size=2 \ | |
# model.dataset.use_cache=false \ | |
# trainer.devices=[0,1] \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# model.dataset.class_balancing="weighted_loss" \ | |
# exp_manager=null | |
# } | |
# } | |
# L2: Parallel NLP Examples 2 | |
# Job: NER fine-tuning starting from the pretrained "ner_en_bert" model.
# SCRIPT changes into examples/nlp/token_classification and runs
# token_classification_train.py on /home/TestData/nlp/ner/ with batch size 2,
# data caching disabled, class_balancing="weighted_loss", one GPU device and
# +trainer.fast_dev_run=true. exp_manager.exp_dir is set to null.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/token_classification && \ | |
python token_classification_train.py \ | |
pretrained_model=ner_en_bert \ | |
model.dataset.data_dir=/home/TestData/nlp/ner/ \ | |
model.train_ds.batch_size=2 \ | |
model.dataset.use_cache=false \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
model.dataset.class_balancing="weighted_loss" \ | |
exp_manager.exp_dir=null | |
# Job: punctuation/capitalization fine-tuning from the pretrained
# "punctuation_en_bert" model.
# SCRIPT changes into examples/nlp/token_classification, creates a temporary
# directory inside the current directory (mktemp -d -p "$(pwd)"), copies the
# *.txt files from /home/TestData/nlp/token_classification_punctuation into it,
# and runs punctuation_capitalization_train_evaluate.py with that directory as
# the train, validation and test ds_item (caching disabled, one GPU device,
# +trainer.fast_dev_run=true, exp_manager.exp_dir=null).
# The temporary directory is removed after ";", i.e. whether or not the python
# command succeeded.
# NOTE(review): because rm is the last command, the script's exit status may be
# that of rm rather than the python run - confirm the template runs the script
# with errexit enabled.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup.
L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/token_classification && \ | |
data_dir="$(mktemp -d -p "$(pwd)")" && \ | |
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ | |
python punctuation_capitalization_train_evaluate.py \ | |
pretrained_model=punctuation_en_bert \ | |
model.train_ds.ds_item="${data_dir}" \ | |
model.validation_ds.ds_item="${data_dir}" \ | |
model.test_ds.ds_item="${data_dir}" \ | |
+model.train_ds.use_cache=false \ | |
+model.validation_ds.use_cache=false \ | |
+model.test_ds.use_cache=false \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
exp_manager.exp_dir=null; | |
rm -rf "${data_dir}" | |
# Job: token classification training with the
# "TurkuNLP/bert-base-finnish-cased-v1" language model.
# SCRIPT changes into examples/nlp/token_classification and runs
# token_classification_train.py on
# /home/TestData/nlp/token_classification_punctuation/ with data caching
# disabled, one GPU device and +trainer.fast_dev_run=true.
# exp_manager.exp_dir is set to null.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/token_classification && \ | |
python token_classification_train.py \ | |
model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
model.dataset.use_cache=false \ | |
model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ | |
exp_manager.exp_dir=null | |
# Job: runs examples/nlp/token_classification/token_classification_evaluate.py
# on /home/TestData/nlp/ner/ (data caching disabled) with the stored
# NER_Model_with_BERT_base_uncased.nemo checkpoint as pretrained_model.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/token_classification/token_classification_evaluate.py \ | |
model.dataset.data_dir=/home/TestData/nlp/ner/ \ | |
model.dataset.use_cache=false \ | |
pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo | |
# Job: test-only run of the punctuation/capitalization script.
# SCRIPT creates a temporary directory inside the current directory, copies the
# *.txt files from /home/TestData/nlp/token_classification_punctuation into it,
# and runs punctuation_capitalization_train_evaluate.py with
# +do_training=false and +do_testing=true. The train and validation data set
# sections are deleted from the config (~model.train_ds, ~model.validation_ds)
# and the temporary directory is used as model.test_ds.ds_item. The model is the
# stored Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo checkpoint.
# The temporary directory is removed after ";", i.e. whether or not the python
# command succeeded.
# NOTE(review): because rm is the last command, the script's exit status may be
# that of rm rather than the python run - confirm the template runs the script
# with errexit enabled.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup.
L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
data_dir="$(mktemp -d -p "$(pwd)")" && \ | |
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ | |
python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ | |
+do_training=false \ | |
+do_testing=true \ | |
model.test_ds.ds_item="${data_dir}" \ | |
~model.train_ds \ | |
~model.validation_ds \ | |
+model.test_ds.use_cache=false \ | |
pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; | |
rm -rf "${data_dir}" | |
# L2: Parallel Pretraining BERT pretraining from Text/Preprocessed | |
# Job: BERT pretraining from raw text.
# SCRIPT changes into examples/nlp/language_modeling and runs
# bert_pretraining.py with bert_pretraining_from_text_config.yaml on the
# wikitext-2 train/valid text files, using a sentencepiece tokenizer model and
# the bert_3200.json model config. It runs on one GPU device at precision 16
# with +trainer.fast_dev_run=true and exp_dir PretrainingBERTFromText.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup.
# AFTER_SCRIPT deletes *.pkl files from /home/TestData/nlp/wikitext-2
# (presumably caches written next to the data files - verify). The removal of
# the experiment directory is currently commented out.
L2_Pretraining_BERT_pretraining_from_Text: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/language_modeling && \ | |
python bert_pretraining.py \ | |
--config-name=bert_pretraining_from_text_config.yaml \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
trainer.precision=16 \ | |
+trainer.fast_dev_run=true \ | |
model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ | |
model.train_ds.batch_size=32 \ | |
model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ | |
model.validation_ds.batch_size=32 \ | |
model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ | |
model.optim.lr=0.01 \ | |
model.optim.sched.warmup_ratio=0.1 \ | |
model.tokenizer.tokenizer_name=sentencepiece \ | |
model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ | |
model.mask_prob=0.15 \ | |
model.short_seq_prob=0.1 \ | |
exp_manager.exp_dir=PretrainingBERTFromText; | |
AFTER_SCRIPT: | | |
rm -f /home/TestData/nlp/wikitext-2/*.pkl | |
#rm -rf examples/nlp/language_modeling/PretrainingBERTFromText | |
# Job: BERT pretraining from preprocessed data.
# SCRIPT changes into examples/nlp/language_modeling and runs
# bert_pretraining.py with bert_pretraining_from_preprocessed_config.yaml on
# /home/TestData/nlp/wiki_book_mini/training, starting from the stored
# bert_base_uncased checkpoint and the uncased_L-12_H-768_A-12.json config.
# fast_dev_run is explicitly false; instead the run is bounded by
# +trainer.max_epochs=1, +trainer.limit_train_batches=1 and
# +trainer.limit_val_batches=0. Checkpoint creation is disabled.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
# The last override deliberately has no trailing backslash, so the python
# command ends there and the commented-out cleanup below stays an independent
# line that can be re-enabled without becoming an argument of the command.
L2_Pretraining_BERT_from_Preprocessed: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/language_modeling && \ | |
python bert_pretraining.py \ | |
--config-name=bert_pretraining_from_preprocessed_config.yaml \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
trainer.precision=16 \ | |
+trainer.fast_dev_run=false \ | |
+trainer.max_epochs=1 \ | |
+trainer.limit_val_batches=0 \ | |
+trainer.limit_train_batches=1 \ | |
model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ | |
model.train_ds.batch_size=8 \ | |
model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ | |
model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ | |
model.optim.lr=0.875e-4 \ | |
model.optim.weight_decay=0.01 \ | |
model.optim.sched.warmup_ratio=0.01 \ | |
exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ | |
exp_manager.create_checkpoint_callback=False | |
#rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed | |
# TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 | |
# is in the release container | |
# L2: NMT Attention is All You Need Training | |
# Job: NMT training with the aayn_base config, followed by a resumed run.
# SCRIPT invokes examples/nlp/machine_translation/enc_dec_nmt.py twice with a
# one-layer encoder/decoder (hidden_size 64, inner_size 256) on the toy
# wmt14-de-en data; no pre_ln override is passed in either call:
#   1. do_testing=false, precision 16, +trainer.max_steps=2, validation check
#      every 2 steps, checkpoint callback enabled, logging to
#      examples/nlp/machine_translation/nmt_results.
#   2. do_testing=true, +trainer.max_steps=10, validation check every 10 steps,
#      same explicit_log_dir, with +exp_manager.resume_if_exists=True.
# The two commands are separate script lines (not joined by &&).
# +model.optim.capturable=True is a workaround tracked by the TODO comment that
# precedes this job in the file.
# NOTE(review): validation_ds/test_ds tgt_file_name point at the .src file, not
# the .ref file used for train_ds - confirm this is intended for the toy data.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. AFTER_SCRIPT removes the nmt_results directory.
L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
python examples/nlp/machine_translation/enc_dec_nmt.py \ | |
--config-path=conf \ | |
--config-name=aayn_base \ | |
do_testing=false \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.encoder.num_layers=1 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.inner_size=256 \ | |
model.decoder.num_layers=1 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.inner_size=256 \ | |
+model.optim.capturable=True \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.val_check_interval=2 \ | |
+trainer.limit_val_batches=1 \ | |
+trainer.max_steps=2 \ | |
trainer.precision=16 \ | |
+exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ | |
+exp_manager.create_checkpoint_callback=true | |
python examples/nlp/machine_translation/enc_dec_nmt.py \ | |
--config-path=conf \ | |
--config-name=aayn_base \ | |
do_testing=true \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.encoder.num_layers=1 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.inner_size=256 \ | |
model.decoder.num_layers=1 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.inner_size=256 \ | |
+model.optim.capturable=True \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.val_check_interval=10 \ | |
+trainer.limit_val_batches=1 \ | |
+trainer.limit_test_batches=1 \ | |
+trainer.max_steps=10 \ | |
+exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ | |
+exp_manager.create_checkpoint_callback=true \ | |
+exp_manager.resume_if_exists=True | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/machine_translation/nmt_results | |
# Job: NMT training with pre-layer-norm enabled on both encoder and decoder
# (model.encoder.pre_ln=true, model.decoder.pre_ln=true).
# SCRIPT changes into examples/nlp/machine_translation and runs enc_dec_nmt.py
# with the aayn_base config on the toy wmt14-de-en data, do_testing=true, one
# GPU device, +trainer.fast_dev_run=true and +trainer.limit_test_batches=2.
# exp_manager is set to null.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python enc_dec_nmt.py \ | |
--config-path=conf \ | |
--config-name=aayn_base \ | |
do_testing=true \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.encoder.pre_ln=true \ | |
model.decoder.pre_ln=true \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
+trainer.limit_test_batches=2 \ | |
exp_manager=null | |
# Job: NMT training with multiple validation and test sets.
# SCRIPT changes into examples/nlp/machine_translation and runs enc_dec_nmt.py
# with the aayn_base config. Training uses the toy wmt14-en-de src/ref pair;
# validation_ds and test_ds are each given a two-element list (wmt13-en-de and
# wmt14-en-de) for both src_file_name and tgt_file_name. Runs with
# do_testing=true, one GPU device, +trainer.fast_dev_run=true and
# +trainer.limit_test_batches=2. exp_manager is set to null.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python enc_dec_nmt.py \ | |
--config-path=conf \ | |
--config-name=aayn_base \ | |
do_testing=true \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ | |
model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ | |
model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ | |
model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ | |
model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ | |
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
+trainer.limit_test_batches=2 \ | |
exp_manager=null | |
# L2: NMT Attention is All You Need Inference | |
# Job: NMT inference with a stored checkpoint.
# SCRIPT changes into examples/nlp/machine_translation and runs
# nmt_transformer_infer.py with the enes_v16k_s100k_6x6.nemo model, reading
# wmt14-de-en.test.src and writing translations to
# /home/TestData/nlp/nmt/toy_data/out.txt (source_lang de, target_lang en).
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup.
# NOTE(review): no AFTER_SCRIPT is defined, so out.txt is left in the TestData
# directory - confirm that is acceptable.
L2_NMT_Attention_is_All_You_Need_Inference: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python nmt_transformer_infer.py \ | |
--model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ | |
--srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ | |
--tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ | |
--target_lang en \ | |
--source_lang de | |
# L2: NMT Attention is All You Need Finetuning | |
# Job: NMT fine-tuning from a stored checkpoint.
# SCRIPT changes into examples/nlp/machine_translation and runs
# enc_dec_nmt_finetune.py with model_path enes_v16k_s100k_6x6.nemo on the toy
# wmt14-de-en data. trainer.max_epochs is deleted from the config
# (~trainer.max_epochs) and the run is bounded by +trainer.max_steps=10, with
# validation every 10 steps and one validation/test batch. The checkpoint
# callback is enabled and monitors val_sacreBLEU in "max" mode with
# save_best_model=true.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup.
# NOTE(review): SCRIPT does cd examples/nlp/machine_translation and then passes
# the relative exp_dir examples/nlp/machine_translation/nmt_finetune, while
# AFTER_SCRIPT removes that same relative path - confirm both resolve to the
# same directory.
L2_NMT_Attention_is_All_You_Need_Finetuning: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python enc_dec_nmt_finetune.py \ | |
model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ | |
trainer.devices=1 \ | |
~trainer.max_epochs \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
+trainer.val_check_interval=10 \ | |
+trainer.limit_val_batches=1 \ | |
+trainer.limit_test_batches=1 \ | |
+trainer.max_steps=10 \ | |
+exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ | |
+exp_manager.create_checkpoint_callback=True \ | |
+exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ | |
+exp_manager.checkpoint_callback_params.mode=max \ | |
+exp_manager.checkpoint_callback_params.save_best_model=true | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/machine_translation/nmt_finetune | |
# L2: NMT Tarred Dataset Creation | |
# Job: tarred data set creation driven by the training script.
# SCRIPT changes into examples/nlp/machine_translation and runs enc_dec_nmt.py
# with the aayn_base config and do_training=false. The train data set is
# flagged as tarred (use_tarred_dataset=true) with 2 preprocessing jobs,
# 500 lines per fragment and 10 batches per tar file, writing to
# $PWD/preproc_out_dir. Encoder and decoder tokenizers use vocab_size=2000 and
# the test data set section is deleted from the config (~model.test_ds).
# exp_manager is set to null.
# Executed through _test_template.yml on RUNNER self-hosted-azure-gpus-1, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python enc_dec_nmt.py \ | |
--config-path=conf \ | |
--config-name=aayn_base \ | |
do_training=false \ | |
model.preproc_out_dir=$PWD/preproc_out_dir \ | |
model.train_ds.use_tarred_dataset=true \ | |
model.train_ds.n_preproc_jobs=2 \ | |
model.train_ds.lines_per_dataset_fragment=500 \ | |
model.train_ds.num_batches_per_tarfile=10 \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.encoder_tokenizer.vocab_size=2000 \ | |
model.decoder_tokenizer.vocab_size=2000 \ | |
~model.test_ds \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.fast_dev_run=true \ | |
exp_manager=null | |
# Job: tarred data set creation with the standalone script.
# SCRIPT changes into examples/nlp/machine_translation and runs
# create_tarred_parallel_dataset.py on the toy wmt14-de-en src/ref pair,
# writing to $PWD/out_dir with encoder/decoder tokenizer vocab size 2000,
# 1000 tokens per batch, 500 lines per fragment, 10 batches per tar file and
# 2 preprocessing jobs.
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. No AFTER_SCRIPT is defined for this job.
L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
cd examples/nlp/machine_translation && \ | |
python create_tarred_parallel_dataset.py \ | |
--src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
--tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
--out_dir $PWD/out_dir \ | |
--encoder_tokenizer_vocab_size=2000 \ | |
--decoder_tokenizer_vocab_size=2000 \ | |
--tokens_in_batch=1000 \ | |
--lines_per_dataset_fragment=500 \ | |
--num_batches_per_tarfile=10 \ | |
--n_preproc_jobs=2 | |
# Job: Megatron NMT training with tensor model parallel size 2, followed by a
# second run against the same experiment directory.
# SCRIPT invokes examples/nlp/machine_translation/megatron_nmt_training.py twice
# on 2 GPU devices (precision 16, max_steps=10, micro/global batch size 2/4)
# with a 4-layer encoder and 2-layer decoder (hidden_size 64, 8 attention
# heads, 'swiglu' activation, 'block' activation checkpointing) on the toy
# wmt14-de-en text_memmap data and a shared sentencepiece tokenizer model.
# The test data set section is deleted from the config (~model.test_ds).
# The second invocation repeats the same overrides except that
# trainer.val_check_interval is 1 instead of 10; the reason is given by the
# comment embedded in the script between the two commands.
# The two commands are separate script lines (not joined by &&).
# Executed through _test_template.yml on RUNNER self-hosted-azure, after
# cicd-test-container-setup. AFTER_SCRIPT removes the megatron_nmt_results
# experiment directory.
L2_Megatron_NMT_Training_TP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/machine_translation/megatron_nmt_training.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
+trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation='swiglu' \ | |
model.encoder.masked_softmax_fusion=False \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method='block' \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=2 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation='swiglu' \ | |
model.decoder.masked_softmax_fusion=False \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method='block' \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.micro_batch_size=2 \ | |
model.global_batch_size=4 \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.train_ds.num_workers=1 \ | |
model.validation_ds.num_workers=1 \ | |
~model.test_ds \ | |
model.train_ds.dataset_type=text_memmap \ | |
model.encoder_tokenizer.library=sentencepiece \ | |
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ | |
model.decoder_tokenizer.library=sentencepiece \ | |
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model | |
# Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error | |
# if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() | |
python examples/nlp/machine_translation/megatron_nmt_training.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=1 \ | |
+trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation='swiglu' \ | |
model.encoder.masked_softmax_fusion=False \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method='block' \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=2 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation='swiglu' \ | |
model.decoder.masked_softmax_fusion=False \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method='block' \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.micro_batch_size=2 \ | |
model.global_batch_size=4 \ | |
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
model.train_ds.num_workers=1 \ | |
model.validation_ds.num_workers=1 \ | |
~model.test_ds \ | |
model.train_ds.dataset_type=text_memmap \ | |
model.encoder_tokenizer.library=sentencepiece \ | |
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ | |
model.decoder_tokenizer.library=sentencepiece \ | |
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/machine_translation/megatron_nmt_results | |
# Pretrains Megatron BART with a Perceiver encoder and the MIM hiddens loss on
# 2 GPUs (tensor model parallel size 2), then launches the same command a
# second time against the same exp_dir to cover the resume path.
# The only difference between the two commands is trainer.val_check_interval
# (10 for the first run, 1 for the second); see the comment inside SCRIPT.
L2_Megatron_BART_Perceiver_MIM_Training_TP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_bart_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=10 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.arch=perceiver \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation='swiglu' \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method='block' \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation='swiglu' \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method='block' \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.micro_batch_size=2 \
        model.global_batch_size=4 \
        model.data.data_impl=text_mmap \
        model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
        model.data.splits_string='"800,100,100"' \
        model.data.whole_word_masking=False \
        model.tokenizer.library=sentencepiece \
        model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
        ++model.hiddens.enc_output_name=z \
        ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
        ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
        ++model.hiddens.loss.mim.cls_name=a_mim \
        ++model.hiddens.loss.mim.loss_weight=0.5
      # Change val_check_interval to 1 for resume: len(dataloader) is 1 because
      # max_steps is the same as in training, and Lightning 2.0 raises an error
      # at the beginning of fit_loop.run() if val_check_interval > len(dataloader):
      # https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259
      python examples/nlp/language_modeling/megatron_bart_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=1 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.arch=perceiver \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation='swiglu' \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method='block' \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation='swiglu' \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method='block' \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.micro_batch_size=2 \
        model.global_batch_size=4 \
        model.data.data_impl=text_mmap \
        model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
        model.data.splits_string='"800,100,100"' \
        model.data.whole_word_masking=False \
        model.tokenizer.library=sentencepiece \
        model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
        ++model.hiddens.enc_output_name=z \
        ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
        ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
        ++model.hiddens.loss.mim.cls_name=a_mim \
        ++model.hiddens.loss.mim.loss_weight=0.5
    # Removes the experiment directory written by both runs above.
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/megatron_mim_results
# stage('L2: NMT Bottleneck Fallback') { | |
# when { | |
# anyOf { | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# parallel { | |
# stage('L2: seq2seq (no bottleneck)') { | |
# steps { | |
# cd examples/nlp/machine_translation && \ | |
# enc_dec_nmt-bottleneck.py \ | |
# --config-path=conf \ | |
# --config-name=aayn_bottleneck \ | |
# do_testing=true \ | |
# model.model_type=nll \ | |
# model.encoder.arch=seq2seq \ | |
# model.encoder.hidden_steps=1 \ | |
# model.encoder.hidden_blocks=1 \ | |
# model.encoder.hidden_init_method=params \ | |
# model.encoder.hidden_size=64 \ | |
# model.encoder.inner_size=128 \ | |
# model.encoder.num_attention_heads=2 \ | |
# model.encoder.num_layers=2 \ | |
# model.decoder.hidden_size=64 \ | |
# model.decoder.inner_size=128 \ | |
# model.decoder.num_attention_heads=2 \ | |
# model.decoder.num_layers=2 \ | |
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ | |
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ | |
# model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ | |
# model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ | |
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ | |
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ | |
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# +trainer.limit_test_batches=2 \ | |
# exp_manager=null \ | |
# } | |
# } | |
# } | |
# } | |
# stage('L2: NMT Bottleneck Architecture') { | |
# when { | |
# anyOf { | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# parallel { | |
# stage('Bridge Encoder (identity)') { | |
# steps { | |
# cd examples/nlp/machine_translation && \ | |
# enc_dec_nmt-bottleneck.py \ | |
# --config-path=conf \ | |
# --config-name=aayn_bottleneck \ | |
# do_testing=true \ | |
# model.model_type=nll \ | |
# model.encoder.arch=bridge \ | |
# model.encoder.hidden_steps=1 \ | |
# model.encoder.hidden_blocks=1 \ | |
# model.encoder.hidden_init_method=identity \ | |
# model.encoder.hidden_size=64 \ | |
# model.encoder.inner_size=128 \ | |
# model.encoder.num_attention_heads=2 \ | |
# model.encoder.num_layers=2 \ | |
# model.decoder.hidden_size=64 \ | |
# model.decoder.inner_size=128 \ | |
# model.decoder.num_attention_heads=2 \ | |
# model.decoder.num_layers=2 \ | |
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# +trainer.limit_test_batches=2 \ | |
# exp_manager=null | |
# } | |
# } | |
# stage('Perceiver Encoder (params)') { | |
# steps { | |
# cd examples/nlp/machine_translation && \ | |
# enc_dec_nmt-bottleneck.py \ | |
# --config-path=conf \ | |
# --config-name=aayn_bottleneck \ | |
# do_testing=true \ | |
# model.model_type=nll \ | |
# model.encoder.arch=perceiver \ | |
# model.encoder.hidden_steps=1 \ | |
# model.encoder.hidden_blocks=1 \ | |
# model.encoder.hidden_init_method=params \ | |
# model.encoder.hidden_size=64 \ | |
# model.encoder.inner_size=128 \ | |
# model.encoder.num_attention_heads=2 \ | |
# model.encoder.num_layers=2 \ | |
# model.decoder.hidden_size=64 \ | |
# model.decoder.inner_size=128 \ | |
# model.decoder.num_attention_heads=2 \ | |
# model.decoder.num_layers=2 \ | |
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# +trainer.limit_test_batches=2 \ | |
# exp_manager=null | |
# } | |
# } | |
# } | |
# } | |
# stage('L2: NMT Bottleneck LVM') { | |
# when { | |
# anyOf { | |
# branch 'main' | |
# changeRequest target: 'main' | |
# } | |
# } | |
# failFast true | |
# parallel { | |
# stage('VAE') { | |
# steps { | |
# cd examples/nlp/machine_translation && \ | |
# enc_dec_nmt-bottleneck.py \ | |
# --config-path=conf \ | |
# --config-name=aayn_bottleneck \ | |
# do_testing=true \ | |
# model.model_type=vae \ | |
# model.encoder.arch=perceiver \ | |
# model.encoder.hidden_steps=1 \ | |
# model.encoder.hidden_blocks=1 \ | |
# model.encoder.hidden_init_method=params \ | |
# model.encoder.hidden_size=64 \ | |
# model.encoder.inner_size=128 \ | |
# model.encoder.num_attention_heads=2 \ | |
# model.encoder.num_layers=2 \ | |
# model.decoder.hidden_size=64 \ | |
# model.decoder.inner_size=128 \ | |
# model.decoder.num_attention_heads=2 \ | |
# model.decoder.num_layers=2 \ | |
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# +trainer.limit_test_batches=2 \ | |
# exp_manager=null | |
# } | |
# } | |
# stage('MIM') { | |
# steps { | |
# cd examples/nlp/machine_translation && \ | |
# enc_dec_nmt-bottleneck.py \ | |
# --config-path=conf \ | |
# --config-name=aayn_bottleneck \ | |
# do_testing=true \ | |
# model.model_type=mim \ | |
# model.encoder.arch=perceiver \ | |
# model.encoder.hidden_steps=1 \ | |
# model.encoder.hidden_blocks=1 \ | |
# model.encoder.hidden_init_method=params \ | |
# model.encoder.hidden_size=64 \ | |
# model.encoder.inner_size=128 \ | |
# model.encoder.num_attention_heads=2 \ | |
# model.encoder.num_layers=2 \ | |
# model.decoder.hidden_size=64 \ | |
# model.decoder.inner_size=128 \ | |
# model.decoder.num_attention_heads=2 \ | |
# model.decoder.num_layers=2 \ | |
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ | |
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ | |
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ | |
# trainer.devices=1 \ | |
# trainer.accelerator="gpu" \ | |
# +trainer.fast_dev_run=true \ | |
# +trainer.limit_test_batches=2 \ | |
# exp_manager=null | |
# } | |
# } | |
# } | |
# } | |
# Pretrains Megatron BERT for 10 steps on 2 GPUs split into two pipeline
# stages, then resumes from the same exp_dir and continues to step 20.
#
# The parallel layout is pinned explicitly (tensor parallel 1, pipeline
# parallel 2) so that the two GPUs are really used as pipeline stages; with
# tensor_model_parallel_size=2 this job would not cover pipeline parallelism
# at all, despite its name.
L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled option kept for reference: --user 0:128
    # Folded into a single line of `docker create` options.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    # Run 1 trains to max_steps=10; run 2 sets exp_manager.resume_if_exists=True
    # and max_steps=20 on the same exp_dir. The trailing rm -rf lines remove the
    # results and index-mapping directories once both runs have succeeded.
    - run: |
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=10 \
          trainer.precision=bf16 \
          model.megatron_amp_O2=True \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          model.tensor_model_parallel_size=1 \
          model.pipeline_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method=block \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=20 \
          trainer.precision=bf16 \
          model.megatron_amp_O2=True \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          exp_manager.resume_if_exists=True \
          model.tensor_model_parallel_size=1 \
          model.pipeline_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method=block \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        rm -rf examples/nlp/language_modeling/bert_pretrain_results
        rm -rf examples/nlp/language_modeling/bert_index_mappings
    # Invoke the repository's cancel-workflow action when an earlier step failed.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"
# Pretrains Megatron BERT for 10 steps on 2 GPUs with tensor model parallel
# size 2, then resumes from the same exp_dir and continues to step 20.
L2_Megatron_Bert_Pretraining_and_Resume_Training:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled option kept for reference: --user 0:128
    # Folded into a single line of `docker create` options.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    # Run 1 trains to max_steps=10; run 2 sets exp_manager.resume_if_exists=True
    # and max_steps=20 on the same exp_dir, then the output directories are
    # removed.
    # NOTE(review): model.sequence_parallel=True is passed to the first run only;
    # the resume run omits it. Confirm whether that asymmetry is intentional.
    - run: |
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=10 \
          trainer.precision=bf16 \
          model.megatron_amp_O2=True \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          model.tensor_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.sequence_parallel=True \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method=block \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=20 \
          trainer.precision=bf16 \
          model.megatron_amp_O2=True \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          exp_manager.resume_if_exists=True \
          model.tensor_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method=block \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        rm -rf examples/nlp/language_modeling/bert_pretrain_results
        rm -rf examples/nlp/language_modeling/bert_index_mappings
    # Invoke the repository's cancel-workflow action when an earlier step failed.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"
# Pretrains BERT with model.mcore_bert=True for 10 steps on 2 GPUs with tensor
# model parallel size 2, then resumes from the same exp_dir and continues to
# step 20. Unlike the non-mcore BERT jobs, no trainer.precision or
# model.megatron_amp_O2 override is passed here.
L2_Megatron_Core_Bert_Pretraining_and_Resume_Training:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled option kept for reference: --user 0:128
    # Folded into a single line of `docker create` options.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    # Run 1 trains to max_steps=10; run 2 sets exp_manager.resume_if_exists=True
    # and max_steps=20 on the same exp_dir, then the output directories are
    # removed.
    # NOTE(review): model.sequence_parallel=True is passed to the first run only;
    # the resume run omits it. Confirm whether that asymmetry is intentional.
    - run: |
        NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=10 \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          model.mcore_bert=True \
          model.tensor_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.sequence_parallel=True \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method='block' \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
          trainer.devices=2 \
          trainer.accelerator=gpu \
          trainer.log_every_n_steps=1 \
          trainer.val_check_interval=10 \
          trainer.limit_val_batches=2 \
          trainer.accumulate_grad_batches=1 \
          trainer.max_steps=20 \
          trainer.gradient_clip_val=1.0 \
          exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
          exp_manager.resume_if_exists=True \
          model.mcore_bert=True \
          model.tensor_model_parallel_size=2 \
          model.optim.name=fused_adam \
          model.optim.lr=2e-4 \
          model.optim.sched.warmup_steps=2 \
          model.optim.sched.constant_steps=2 \
          model.optim.sched.min_lr=8e-5 \
          model.max_position_embeddings=128 \
          model.encoder_seq_length=128 \
          model.data.seq_length=128 \
          model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
          model.num_layers=8 \
          model.hidden_size=256 \
          model.num_attention_heads=8 \
          model.activations_checkpoint_method='block' \
          model.activations_checkpoint_num_layers=1 \
          model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
          model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
        rm -rf examples/nlp/language_modeling/bert_pretrain_results
        rm -rf examples/nlp/language_modeling/bert_index_mappings
    # Invoke the repository's cancel-workflow action when an earlier step failed.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"
# Runs megatron_retro_pretraining.py with model.mcore_gpt=True on 2 GPUs
# (tensor and pipeline model parallel size 1) for 10 steps, then runs the same
# command against the same exp_dir with trainer.max_steps=20.
# NOTE(review): neither command passes exp_manager.resume_if_exists, so the
# resume behaviour presumably comes from the script's default config - confirm.
L2_Megatron_RETRO_Pretraining_and_Resume_Training:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    # The backslash-escaped quotes in model.data.splits_string are deliberate
    # shell quoting: after the shell strips the backslashes the script receives
    # the value '"98,2,0"' (both quote layers included). Kept verbatim.
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_retro_pretraining.py \
        trainer.num_nodes=1 \
        trainer.devices=2 \
        trainer.precision=bf16 \
        trainer.accelerator=gpu \
        model.data.data_prefix=['none'] \
        exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
        model.mcore_gpt=True \
        model.tensor_model_parallel_size=1 \
        model.pipeline_model_parallel_size=1 \
        model.optim.name=distributed_fused_adam \
        model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
        model.data.num_workers=4 \
        model.micro_batch_size=1 \
        model.data.shuffle_documents=False \
        trainer.val_check_interval=30 \
        +trainer.num_sanity_val_steps=0 \
        model.init_method_std=0.023 \
        model.optim.lr=6.0e-4 \
        model.megatron_amp_O2=True \
        model.data.splits_string=\'\"98,2,0\"\' \
        model.data.dataloader_type=cyclic \
        trainer.max_steps=10
      python examples/nlp/language_modeling/megatron_retro_pretraining.py \
        trainer.num_nodes=1 \
        trainer.devices=2 \
        trainer.precision=bf16 \
        trainer.accelerator=gpu \
        model.data.data_prefix=['none'] \
        exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
        model.mcore_gpt=True \
        model.tensor_model_parallel_size=1 \
        model.pipeline_model_parallel_size=1 \
        model.optim.name=distributed_fused_adam \
        model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
        model.data.num_workers=4 \
        model.micro_batch_size=1 \
        model.data.shuffle_documents=False \
        trainer.val_check_interval=30 \
        +trainer.num_sanity_val_steps=0 \
        model.init_method_std=0.023 \
        model.optim.lr=6.0e-4 \
        model.megatron_amp_O2=True \
        model.data.splits_string=\'\"98,2,0\"\' \
        model.data.dataloader_type=cyclic \
        trainer.max_steps=20
    # Removes the experiment directory written by both runs above.
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/mcore_retro_results
# Runs megatron_retro_pretraining_legacy.py on 2 GPUs with tensor model
# parallel size 2 for 10 steps, then runs it again against the same exp_dir
# with trainer.max_steps=20. Both commands pass
# exp_manager.resume_if_exists=True.
#
# model.data.data_prefix, knn_index and retrieval_prefix are passed as empty
# values and +model.data.mock=True is added.
# NOTE(review): presumably mock data replaces the on-disk dataset, which is
# why the three paths are left empty - confirm against the training script.
L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
        trainer.devices=2 \
        trainer.num_nodes=1 \
        trainer.accelerator=gpu \
        trainer.accumulate_grad_batches=1 \
        trainer.limit_val_batches=2 \
        exp_manager.resume_if_exists=True \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        trainer.val_check_interval=10 \
        exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
        model.data.data_prefix= \
        model.data.knn_index= \
        model.data.retrieval_prefix= \
        model.tensor_model_parallel_size=2 \
        model.micro_batch_size=4 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.chunk_size=32 \
        model.enc_num_layers=2 \
        model.dec_num_layers=2 \
        model.enc_cross_attention=[1] \
        model.dec_cross_attention=[1] \
        +model.data.mock=True
      python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
        trainer.devices=2 \
        trainer.num_nodes=1 \
        trainer.accelerator=gpu \
        trainer.accumulate_grad_batches=1 \
        trainer.limit_val_batches=2 \
        exp_manager.resume_if_exists=True \
        trainer.max_steps=20 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        trainer.val_check_interval=10 \
        exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
        model.data.data_prefix= \
        model.data.knn_index= \
        model.data.retrieval_prefix= \
        model.tensor_model_parallel_size=2 \
        model.micro_batch_size=4 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.chunk_size=32 \
        model.enc_num_layers=2 \
        model.dec_num_layers=2 \
        model.enc_cross_attention=[1] \
        model.dec_cross_attention=[1] \
        +model.data.mock=True
    # Removes the experiment directory written by both runs above.
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/retro_legacy_results
# L2_Megatron_RETRO_muTransfer_Pretraining_Performance: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ | |
# trainer.devices=2 \ | |
# trainer.num_nodes=1 \ | |
# trainer.accelerator=gpu \ | |
# trainer.accumulate_grad_batches=1 \ | |
# trainer.max_steps=100 \ | |
# trainer.log_every_n_steps=1 \ | |
# trainer.precision=16 \ | |
# trainer.val_check_interval=100 \ | |
# trainer.limit_val_batches=0 \ | |
# trainer.gradient_clip_val=1.0 \ | |
# +trainer.num_sanity_val_steps=0 \ | |
# exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ | |
# +exp_manager.version=smalltest \ | |
# model.data.neighbors=2 \ | |
# model.megatron_amp_O2=False \ | |
# model.apply_query_key_layer_scaling=False \ | |
# model.tensor_model_parallel_size=1 \ | |
# model.optim.name=muadamw \ | |
# model.optim.weight_decay=0.1 \ | |
# model.optim.betas=[0.9,0.95] \ | |
# model.optim.lr=6e-4 \ | |
# model.optim.sched.warmup_steps=1000 \ | |
# model.optim.sched.constant_steps=0 \ | |
# model.optim.sched.min_lr=6e-5 \ | |
# model.add_position_embedding=False \ | |
# model.enc_num_layers=2 \ | |
# model.dec_num_layers=6 \ | |
# model.enc_cross_attention=[0] \ | |
# model.dec_cross_attention=[3,5] \ | |
# model.hidden_size=96 \ | |
# model.ffn_hidden_size=384 \ | |
# model.init_method_std=0.023 \ | |
# model.num_attention_heads=12 \ | |
# model.max_position_embeddings=1024 \ | |
# model.encoder_seq_length=1024 \ | |
# model.tokenizer.library=megatron \ | |
# model.tokenizer.type=GPT2BPETokenizer \ | |
# model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ | |
# model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ | |
# model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ | |
# model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ | |
# model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ | |
# model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ | |
# model.data.num_workers=8 \ | |
# model.micro_batch_size=8 \ | |
# model.normalization=rmsnorm \ | |
# model.transformer_block_type=pre_ln \ | |
# model.bias_activation_fusion=True \ | |
# model.bias_dropout_add_fusion=False \ | |
# model.masked_softmax_fusion=True \ | |
# model.hidden_dropout=0 \ | |
# model.attention_dropout=0 \ | |
# model.fp32_residual_connection=True \ | |
# model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml | |
# python -c "import pandas as pd | |
# import pathlib | |
# from pandas.testing import assert_frame_equal | |
# from tensorboard.backend.event_processing.event_accumulator import EventAccumulator | |
# import torch | |
# if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): | |
# import sys | |
# sys.exit(0) | |
# event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] | |
# ea = EventAccumulator(str(event_file)).Reload() | |
# vals = [] | |
# for i in ea.Scalars('reduced_train_loss'): | |
# vals.append(i.value) | |
# training_curve = pd.DataFrame({'loss': vals}) | |
# gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') | |
# assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" | |
# rm -rf examples/nlp/language_modeling/retro_results | |
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# if: "failure()" | |
L2_RAG_Pipeline_Indexing:
  # Runs examples/nlp/rag/rag_indexing.py on one device with the
  # sbert_nemo.nemo embedder over the sample corpus (chunk size 256,
  # overlap 10); the index path passed to the script lives on the mounted
  # TestData volume.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/rag/rag_indexing.py \
        trainer.num_nodes=1 \
        trainer.devices=1 \
        trainer.precision='bf16-mixed' \
        indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \
        indexing.embedder.embed_batch_size=128 \
        indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \
        indexing.data.chunk_size=256 \
        indexing.data.chunk_overlap=10 \
        indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index'
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
L2_RAG_Pipeline_Generating:
  # Runs examples/nlp/rag/rag_generating.py on one device: it is given the
  # sbert_nemo.nemo embedder, the sample index path and the
  # megatron_gpt_125m.nemo LLM, and is asked to generate 50 tokens
  # (sampling, temperature 1.0) for a fixed query.
  # NOTE(review): indexing.index_path is the same path that
  # L2_RAG_Pipeline_Indexing passes as its index path, yet this job does not
  # list that job in `needs` -- confirm a pre-built index is expected on the
  # TestData volume.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/rag/rag_generating.py \
        trainer.devices=1 \
        trainer.precision='bf16-mixed' \
        indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \
        indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \
        generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \
        generating.inference.tokens_to_generate=50 \
        generating.inference.greedy=False \
        generating.inference.temperature=1.0 \
        generating.query='Which art schools did I applied to?'
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
L2_BioMegatron_Bert_NER_Task:
  # Calls the shared _test_template.yml reusable workflow. SCRIPT trains
  # token classification for one epoch with the
  # biomegatron345m_biovocab_30k_cased language model on the NER data dir;
  # AFTER_SCRIPT deletes the experiment directory that SCRIPT points
  # exp_manager at.
  needs:
    - cicd-test-container-setup
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/token_classification/token_classification_train.py \
      exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \
      trainer.max_epochs=1 \
      model.dataset.data_dir=/home/TestData/nlp/ner \
      model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \
      model.tokenizer.tokenizer_name=null
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/token_classification_results
L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2:
  # Two back-to-back invocations of megatron_gpt_pretraining.py with
  # tensor_model_parallel_size=2 on two devices:
  #   1. a fresh run capped at max_steps=3;
  #   2. a second run with exp_manager.resume_if_exists=True against the
  #      same exp_dir, capped at max_steps=6.
  # The script finishes by deleting the results and index-mapping dirs.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=1 \
        model.optim.sched.constant_steps=1 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.bias=False \
        model.bias_activation_fusion=False \
        model.bias_dropout_add_fusion=False \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_granularity=full \
        model.activations_checkpoint_num_layers=1 \
        model.data.validation_drop_last=False \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=6 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.tensor_model_parallel_size=2 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.bias=False \
        model.bias_activation_fusion=False \
        model.bias_dropout_add_fusion=False \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_granularity=full \
        model.activations_checkpoint_num_layers=1 \
        model.data.validation_drop_last=False \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        rm -rf examples/nlp/language_modeling/gpt_pretrain_results
        rm -rf examples/nlp/language_modeling/gpt_index_mappings
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2:
  # One 3-step megatron_gpt_pretraining.py run with
  # position_embedding_type=rope, rotary_percentage=0.5 and
  # tensor_model_parallel_size=2, followed by removal of the results and
  # index-mapping dirs. Despite the job name, the resume run is currently
  # commented out inside the script.
  # NOTE: the last line of that commented-out command ends with a stray
  # double quote; drop it before re-enabling the command.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=1 \
        model.optim.sched.constant_steps=1 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.position_embedding_type=rope \
        model.rotary_percentage=0.5 \
        model.bias=False \
        model.bias_activation_fusion=False \
        model.bias_dropout_add_fusion=False \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_granularity=full \
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        # commented out to save time on github ci @adithyare
        # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        # trainer.devices=2 \
        # trainer.accelerator=gpu \
        # trainer.log_every_n_steps=1 \
        # trainer.val_check_interval=2 \
        # trainer.limit_val_batches=1 \
        # trainer.accumulate_grad_batches=1 \
        # trainer.max_steps=6 \
        # trainer.gradient_clip_val=1.0 \
        # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        # exp_manager.resume_if_exists=True \
        # model.tensor_model_parallel_size=2 \
        # model.optim.name=fused_adam \
        # model.optim.lr=2e-4 \
        # model.optim.sched.warmup_steps=2 \
        # model.optim.sched.constant_steps=2 \
        # model.optim.sched.min_lr=8e-5 \
        # model.max_position_embeddings=128 \
        # model.encoder_seq_length=128 \
        # model.data.seq_length=128 \
        # model.position_embedding_type=rope \
        # model.rotary_percentage=0.5 \
        # model.normalization=rmsnorm \
        # model.bias=False \
        # model.bias_activation_fusion=False \
        # model.bias_dropout_add_fusion=False \
        # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        # model.num_layers=8 \
        # model.hidden_size=256 \
        # model.num_attention_heads=8 \
        # model.activations_checkpoint_method=block \
        # model.activations_checkpoint_granularity=full \
        # model.activations_checkpoint_num_layers=1 \
        # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
        rm -rf examples/nlp/language_modeling/gpt_pretrain_results
        rm -rf examples/nlp/language_modeling/gpt_index_mappings
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
# This test requires an Ampere GPU, but some of the test GPUs are Volta.
# A compute-capability check must be added before this test is uncommented.
# - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2 | |
# when { | |
# anyOf { | |
# branch main | |
# changeRequest target: main | |
# } | |
# } | |
# failFast true | |
# - run: | | |
# python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ | |
# trainer.devices=2 \ | |
# trainer.accelerator=gpu \ | |
# trainer.log_every_n_steps=1 \ | |
# trainer.val_check_interval=2 \ | |
# trainer.limit_val_batches=2 \ | |
# trainer.accumulate_grad_batches=1 \ | |
# trainer.max_steps=3 \ | |
# trainer.precision=16 \ | |
# trainer.gradient_clip_val=1.0 \ | |
# exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ | |
# model.tensor_model_parallel_size=2 \ | |
# model.optim.name=fused_adam \ | |
# model.optim.lr=2e-4 \ | |
# model.optim.sched.warmup_steps=1 \ | |
# model.optim.sched.constant_steps=1 \ | |
# model.optim.sched.min_lr=8e-5 \ | |
# model.max_position_embeddings=128 \ | |
# model.encoder_seq_length=128 \ | |
# model.data.seq_length=128 \ | |
# model.position_embedding_type=rope \ | |
# model.rotary_percentage=0.5 \ | |
# model.normalization=rmsnorm \ | |
# model.bias=False \ | |
# model.bias_activation_fusion=False \ | |
# model.bias_dropout_add_fusion=False \ | |
# model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ | |
# model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ | |
# model.num_layers=8 \ | |
# model.hidden_size=256 \ | |
# model.num_attention_heads=8 \ | |
# model.activations_checkpoint_method=block \ | |
# model.activations_checkpoint_granularity=full \ | |
# model.activations_checkpoint_num_layers=1 \ | |
# model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ | |
# model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ | |
# model.use_flash_attention=True " | |
# # commented out to save time on github ci @adithyare | |
# # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ | |
# # trainer.devices=2 \ | |
# # trainer.accelerator=gpu \ | |
# # trainer.log_every_n_steps=1 \ | |
# # trainer.val_check_interval=2 \ | |
# # trainer.limit_val_batches=1 \ | |
# # trainer.accumulate_grad_batches=1 \ | |
# # trainer.max_steps=6 \ | |
# # trainer.precision=16 \ | |
# # trainer.gradient_clip_val=1.0 \ | |
# # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ | |
# # exp_manager.resume_if_exists=True \ | |
# # model.tensor_model_parallel_size=2 \ | |
# # model.optim.name=fused_adam \ | |
# # model.optim.lr=2e-4 \ | |
# # model.optim.sched.warmup_steps=2 \ | |
# # model.optim.sched.constant_steps=2 \ | |
# # model.optim.sched.min_lr=8e-5 \ | |
# # model.max_position_embeddings=128 \ | |
# # model.encoder_seq_length=128 \ | |
# # model.data.seq_length=128 \ | |
# # model.position_embedding_type=rope \ | |
# # model.rotary_percentage=0.5 \ | |
# # model.normalization=rmsnorm \ | |
# # model.bias=False \ | |
# # model.bias_activation_fusion=False \ | |
# # model.bias_dropout_add_fusion=False \ | |
# # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ | |
# # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ | |
# # model.num_layers=8 \ | |
# # model.hidden_size=256 \ | |
# # model.num_attention_heads=8 \ | |
# # model.activations_checkpoint_method=block \ | |
# # model.activations_checkpoint_granularity=full \ | |
# # model.activations_checkpoint_num_layers=1 \ | |
# # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ | |
# # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ | |
# # model.use_flash_attention=True" | |
# rm -rf examples/nlp/language_modeling/gpt_pretrain_results" | |
# rm -rf examples/nlp/language_modeling/gpt_index_mappings" | |
# } | |
# } | |
L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2:
  # Two back-to-back bf16 invocations of megatron_gpt_pretraining.py with
  # tensor_model_parallel_size=2, megatron_amp_O2=True and the
  # distributed_fused_adam optimizer:
  #   1. a fresh run capped at max_steps=3;
  #   2. a second run with exp_manager.resume_if_exists=True and
  #      model.reset_lr=True against the same exp_dir, capped at max_steps=6.
  # The script finishes by deleting the results and index-mapping dirs.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=3 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.precision=bf16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.megatron_amp_O2=True \
        model.optim.name=distributed_fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=3 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=6 \
        trainer.precision=bf16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.reset_lr=True \
        model.tensor_model_parallel_size=2 \
        model.megatron_amp_O2=True \
        model.optim.name=distributed_fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        rm -rf examples/nlp/language_modeling/gpt_pretrain_results
        rm -rf examples/nlp/language_modeling/gpt_index_mappings
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2:
  # One 3-step megatron_gpt_pretraining.py run with
  # position_embedding_type=alibi and tensor_model_parallel_size=2, followed
  # by removal of the results and index-mapping dirs. Despite the job name,
  # the resume run is currently commented out inside the script.
  # NOTE: the last line of that commented-out command ends with a stray
  # double quote; drop it before re-enabling the command.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=1 \
        model.optim.sched.constant_steps=1 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.position_embedding_type=alibi \
        model.bias=False \
        model.bias_activation_fusion=False \
        model.bias_dropout_add_fusion=False \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_granularity=full \
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        # not testing resume functionality to save time on ci @adithyare
        #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        #trainer.devices=2 \
        #trainer.accelerator=gpu \
        #trainer.log_every_n_steps=1 \
        #trainer.val_check_interval=2 \
        #trainer.limit_val_batches=1 \
        #trainer.accumulate_grad_batches=1 \
        #trainer.max_steps=6 \
        #trainer.gradient_clip_val=1.0 \
        #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        #exp_manager.resume_if_exists=True \
        #model.tensor_model_parallel_size=2 \
        #model.optim.name=fused_adam \
        #model.optim.lr=2e-4 \
        #model.optim.sched.warmup_steps=2 \
        #model.optim.sched.constant_steps=2 \
        #model.optim.sched.min_lr=8e-5 \
        #model.max_position_embeddings=128 \
        #model.encoder_seq_length=128 \
        #model.data.seq_length=128 \
        #model.position_embedding_type=alibi \
        #model.normalization=rmsnorm \
        #model.bias=False \
        #model.bias_activation_fusion=False \
        #model.bias_dropout_add_fusion=False \
        #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        #model.num_layers=8 \
        #model.hidden_size=256 \
        #model.num_attention_heads=8 \
        #model.activations_checkpoint_method=block \
        #model.activations_checkpoint_granularity=full \
        #model.activations_checkpoint_num_layers=1 \
        #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
        rm -rf examples/nlp/language_modeling/gpt_pretrain_results
        rm -rf examples/nlp/language_modeling/gpt_index_mappings
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2:
  # One 3-step megatron_gpt_pretraining.py run with
  # position_embedding_type=kerple and tensor_model_parallel_size=2,
  # followed by removal of the results and index-mapping dirs. Despite the
  # job name, the resume run is currently commented out inside the script.
  # NOTE: the last line of that commented-out command ends with a stray
  # double quote; drop it before re-enabling the command.
  needs:
    - cicd-test-container-setup
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Disabled container option, kept for reference: --user 0:128
    # The folded scalar below collapses to a single space-separated
    # option string.
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=1 \
        model.optim.sched.constant_steps=1 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.data.seq_length=128 \
        model.position_embedding_type=kerple \
        model.bias=False \
        model.bias_activation_fusion=False \
        model.bias_dropout_add_fusion=False \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_granularity=full \
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
        # commented out to save time on github ci @adithyare
        #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        #trainer.devices=2 \
        #trainer.accelerator=gpu \
        #trainer.log_every_n_steps=1 \
        #trainer.val_check_interval=2 \
        #trainer.limit_val_batches=1 \
        #trainer.accumulate_grad_batches=1 \
        #trainer.max_steps=6 \
        #trainer.precision=16 \
        #trainer.gradient_clip_val=1.0 \
        #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        #exp_manager.resume_if_exists=True \
        #model.tensor_model_parallel_size=2 \
        #model.optim.name=fused_adam \
        #model.optim.lr=2e-4 \
        #model.optim.sched.warmup_steps=2 \
        #model.optim.sched.constant_steps=2 \
        #model.optim.sched.min_lr=8e-5 \
        #model.max_position_embeddings=128 \
        #model.encoder_seq_length=128 \
        #model.data.seq_length=128 \
        #model.position_embedding_type=kerple \
        #model.normalization=rmsnorm \
        #model.bias=False \
        #model.bias_activation_fusion=False \
        #model.bias_dropout_add_fusion=False \
        #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        #model.num_layers=8 \
        #model.hidden_size=256 \
        #model.num_attention_heads=8 \
        #model.activations_checkpoint_method=block \
        #model.activations_checkpoint_granularity=full \
        #model.activations_checkpoint_num_layers=1 \
        #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
        rm -rf examples/nlp/language_modeling/gpt_pretrain_results
        rm -rf examples/nlp/language_modeling/gpt_index_mappings
    # Invoked only when an earlier step in this job has failed.
    - uses: NVIDIA/NeMo/.github/actions/cancel-workflow@main
      if: failure()
# GPT pretraining smoke test with pipeline parallelism (PP=2, TP=1, mcore,
# bf16, megatron_amp_O2). SCRIPT invokes megatron_gpt_pretraining.py twice
# against the same exp_dir:
#   1. max_steps=3, scheduler warmup/constant steps = 1
#   2. max_steps=6, exp_manager.resume_if_exists=True, warmup/constant steps = 2
# The second invocation presumably resumes from what the first one wrote.
# AFTER_SCRIPT removes the results and index-mapping directories; when it runs
# is decided by _test_template.yml (not visible from this file).
L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=3 \
        trainer.precision=bf16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        model.mcore_gpt=True \
        model.megatron_amp_O2=True \
        model.optim.name=distributed_fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=1 \
        model.optim.sched.constant_steps=1 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.activation=fast-swiglu \
        model.bias_activation_fusion=False \
        model.hidden_dropout=0.0 \
        model.attention_dropout=0.0 \
        model.transformer_block_type=normformer \
        model.headscale=True \
        model.data.seq_length=128 \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings

      python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=6 \
        trainer.precision=bf16 \
        trainer.gradient_clip_val=1.0 \
        model.mcore_gpt=True \
        model.megatron_amp_O2=True \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        model.optim.name=distributed_fused_adam \
        model.optim.lr=2e-4 \
        model.optim.sched.warmup_steps=2 \
        model.optim.sched.constant_steps=2 \
        model.optim.sched.min_lr=8e-5 \
        model.max_position_embeddings=128 \
        model.encoder_seq_length=128 \
        model.activation=fast-swiglu \
        model.bias_activation_fusion=False \
        model.hidden_dropout=0.0 \
        model.attention_dropout=0.0 \
        model.transformer_block_type=normformer \
        model.headscale=True \
        model.data.seq_length=128 \
        model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
        model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
        model.num_layers=8 \
        model.hidden_size=256 \
        model.num_attention_heads=8 \
        model.activations_checkpoint_method=block \
        model.activations_checkpoint_num_layers=1 \
        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/gpt_pretrain_results
      rm -rf examples/nlp/language_modeling/gpt_index_mappings
# Full fine-tuning (model.peft.peft_scheme=null) of the PP2/TP1 GPT checkpoint
# on two devices, fp16, max_steps=3. SCRIPT runs megatron_gpt_finetuning.py
# twice with identical overrides except trainer.val_check_interval (2, then 1).
# AFTER_SCRIPT removes the shared results directory.
L2_Megatron_GPT_Finetuning_PP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=2 \
        +trainer.limit_val_batches=2 \
        trainer.max_steps=3 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.peft.peft_scheme=null \
        model.data.train_ds.micro_batch_size=1 \
        model.data.train_ds.global_batch_size=4 \
        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \
        model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \
        model.data.train_ds.num_workers=0 \
        model.data.test_ds.micro_batch_size=1 \
        model.data.test_ds.global_batch_size=1 \
        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.test_ds.names=[quarel] \
        model.data.validation_ds.micro_batch_size=1 \
        model.data.validation_ds.global_batch_size=1 \
        model.data.validation_ds.num_workers=0 \
        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.validation_ds.names=[quarel]

      python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=1 \
        +trainer.limit_val_batches=2 \
        trainer.max_steps=3 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
        model.optim.name=fused_adam \
        model.optim.lr=2e-4 \
        model.peft.peft_scheme=null \
        model.data.train_ds.micro_batch_size=1 \
        model.data.train_ds.global_batch_size=4 \
        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \
        model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \
        model.data.train_ds.num_workers=0 \
        model.data.test_ds.micro_batch_size=1 \
        model.data.test_ds.global_batch_size=1 \
        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.test_ds.names=[quarel] \
        model.data.validation_ds.micro_batch_size=1 \
        model.data.validation_ds.global_batch_size=1 \
        model.data.validation_ds.num_workers=0 \
        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.validation_ds.names=[quarel]
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/gpt_sft_results
# Full fine-tuning (model.peft.peft_scheme=none) of the StarCoder CI checkpoint
# on one device (TP=1, PP=1, bf16, max_steps=4, checkpointing disabled).
# This job declares its container inline rather than calling
# _test_template.yml, so cleanup and failure handling are spelled out as steps.
L2_Megatron_GPT_Finetuning_StarCoder_PP1:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure-gpus-1
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    # Folded scalar: the lines below join into one space-separated string.
    # Disabled option kept for reference: --user 0:128
    options: >-
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Run StarCoder fine-tuning
      run: |
        python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
          trainer.devices=1 \
          trainer.num_nodes=1 \
          trainer.precision=bf16 \
          trainer.max_steps=4 \
          trainer.val_check_interval=4 \
          trainer.enable_checkpointing=False \
          +trainer.limit_val_batches=2 \
          +trainer.limit_test_batches=2 \
          exp_manager.checkpoint_callback_params.save_best_model=False \
          exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
          model.peft.peft_scheme=none \
          model.optim.name=distributed_fused_adam \
          model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \
          model.tensor_model_parallel_size=1 \
          model.pipeline_model_parallel_size=1 \
          model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
          model.data.train_ds.num_workers=0 \
          model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
          model.data.validation_ds.num_workers=0 \
          model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
          model.data.test_ds.num_workers=0 \
          model.data.train_ds.concat_sampling_probabilities=[1.0]
    # Cleanup lives in its own always() step so the results directory is removed
    # even when the training step above fails or the job is cancelled.
    - name: Remove fine-tuning results
      if: always()
      run: rm -rf examples/nlp/language_modeling/gpt_sft_results
    # failure() is still true here when an earlier step failed, regardless of
    # the cleanup step succeeding.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"
# GPT embedding (information retrieval) test on one device.
# SCRIPT first clears the working directory, then:
#   1. megatron_gpt_embedding_finetuning.py trains for 20 steps
#      (lora_tuning.adapter_dim=8) and writes validation embeddings;
#   2. megatron_gpt_embedding_generate.py loads the PEFT checkpoint from the
#      working directory and writes test embeddings.
# AFTER_SCRIPT removes the working directory again.
L2_Megatron_GPT_Embedding:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_ir/working_dir

      python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \
        exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
        model.global_batch_size=4 \
        model.micro_batch_size=4 \
        trainer.devices=1 \
        trainer.num_nodes=1 \
        trainer.max_epochs=null \
        trainer.max_steps=20 \
        trainer.val_check_interval=10 \
        model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
        model.peft.lora_tuning.adapter_dim=8 \
        model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
        model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \
        model.data.validation_ds.write_embeddings_to_file=True \
        model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]

      python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \
        trainer.devices=1 \
        trainer.num_nodes=1 \
        model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
        model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \
        model.global_batch_size=4 \
        model.micro_batch_size=4 \
        model.peft.lora_tuning.adapter_dim=8 \
        model.data.test_ds.write_embeddings_to_file=True \
        model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \
        model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
        model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl]
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/megatron_ir/working_dir
# LoRA PEFT test with pipeline parallelism (PP=2, TP=1) and megatron_amp_O2.
# SCRIPT clears the experiment directory, fine-tunes megatron_llama.nemo with
# peft_scheme=lora for 3 steps, then runs megatron_gpt_generate.py with the
# LoRA checkpoint written under that directory. AFTER_SCRIPT removes it.
L2_Megatron_GPT_PEFT_Lora_PP2_O2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      rm -rf /home/TestData/nlp/lora_tuning_pp2

      python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.max_epochs=9999 \
        trainer.max_steps=3 \
        trainer.val_check_interval=3 \
        ++trainer.limit_val_batches=2 \
        trainer.precision=bf16 \
        exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
        model.megatron_amp_O2=True \
        model.peft.peft_scheme=lora \
        model.answer_only_loss=True \
        model.micro_batch_size=1 \
        model.global_batch_size=1 \
        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.train_ds.concat_sampling_probabilities=[1.0] \
        model.data.train_ds.num_workers=0 \
        model.data.validation_ds.num_workers=0 \
        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.validation_ds.names=[quarel]

      python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
        model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
        model.pipeline_model_parallel_size=2 \
        model.tensor_model_parallel_size=1 \
        trainer.devices=2 \
        model.megatron_amp_O2=True \
        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
        model.data.test_ds.names=['quarel4'] \
        model.global_batch_size=2 \
        model.micro_batch_size=1 \
        model.data.test_ds.tokens_to_generate=10 \
        model.data.test_ds.write_predictions_to_file=True \
        model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \
        inference.greedy=True \
        inference.repetition_penalty=1.0 \
        inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/lora_tuning_pp2
# LoRA PEFT test with tensor parallelism (TP=2, PP=1); unlike the PP2_O2
# sibling job, no model.megatron_amp_O2 override is passed.
# SCRIPT clears the experiment directory, fine-tunes megatron_llama.nemo with
# peft_scheme=lora for 3 steps, then runs megatron_gpt_generate.py with the
# LoRA checkpoint written under that directory. AFTER_SCRIPT removes it.
L2_Megatron_GPT_PEFT_Lora_TP2_O1:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      rm -rf /home/TestData/nlp/lora_tuning_tp2

      python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.max_epochs=9999 \
        trainer.max_steps=3 \
        trainer.val_check_interval=3 \
        ++trainer.limit_val_batches=2 \
        trainer.precision=bf16 \
        exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \
        model.pipeline_model_parallel_size=1 \
        model.tensor_model_parallel_size=2 \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
        model.peft.peft_scheme='lora' \
        model.answer_only_loss=True \
        model.micro_batch_size=1 \
        model.global_batch_size=1 \
        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.train_ds.concat_sampling_probabilities=[1.0] \
        model.data.train_ds.num_workers=0 \
        model.data.validation_ds.num_workers=0 \
        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
        model.data.validation_ds.names=[quarel]

      python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
        model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
        model.tensor_model_parallel_size=2 \
        trainer.devices=2 \
        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
        model.data.test_ds.names=['quarel4'] \
        model.global_batch_size=2 \
        model.micro_batch_size=1 \
        model.data.test_ds.tokens_to_generate=10 \
        model.data.test_ds.write_predictions_to_file=True \
        model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \
        inference.greedy=True \
        inference.repetition_penalty=1.0 \
        inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'
    AFTER_SCRIPT: |
      rm -rf /home/TestData/nlp/lora_tuning_tp2
# Text-generation check: megatron_gpt_eval.py with the 125M GPT checkpoint,
# one prompt, TP=1, 32 generated tokens, fp32. No AFTER_SCRIPT is supplied.
L2_Megatron_GPT_Eval:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_eval.py \
        gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \
        prompts=['How to fix GPU memory? A:'] \
        tensor_model_parallel_size=1 \
        inference.tokens_to_generate=32 \
        trainer.precision=32
# Text-generation check with pipeline parallelism: megatron_gpt_eval.py with
# the PP2/TP1 checkpoint on two devices (server=False, fp32). No prompts
# override and no AFTER_SCRIPT are supplied.
L2_Megatron_GPT_Eval_PP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_gpt_eval.py \
        gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
        server=False \
        tensor_model_parallel_size=1 \
        pipeline_model_parallel_size=2 \
        trainer.devices=2 \
        trainer.num_nodes=1 \
        trainer.precision=32
# Generation with an SFT RoPE checkpoint where test_ds.max_seq_length=6000;
# per the job name this is meant to exceed the training sequence length.
# No PEFT weights are loaded (model.peft.restore_from_path=null).
# NOTE(review): AFTER_SCRIPT removes only inference.outfile_path (out.jsonl);
# files written via output_file_path_prefix (".../out") presumably remain in
# the workspace - confirm whether they should be cleaned up as well.
L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
        model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
        model.peft.restore_from_path=null \
        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \
        model.data.test_ds.names=[test] \
        model.data.test_ds.global_batch_size=1 \
        model.data.test_ds.micro_batch_size=1 \
        model.data.test_ds.tokens_to_generate=30 \
        model.data.test_ds.max_seq_length=6000 \
        model.data.test_ds.write_predictions_to_file=True \
        model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \
        inference.greedy=True \
        inference.repetition_penalty=1.0 \
        inference.outfile_path=examples/nlp/language_modeling/out.jsonl
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/out.jsonl
# TODO: Add this test back. Test was failing on CI machines due to HW error
# - name: L2: Megatron GPT Convert from Megatron-LM checkpoint and Eval
# when {
# anyOf {
# branch main
# changeRequest target: main
# }
# }
# failFast true
# - run: |
# python -m torch.distributed.launch --nproc_per_node=2 \
# examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \
# --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \
# --checkpoint_name=model_optim_rng.pt \
# --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \
# --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \
# --model_type=gpt \
# --pipeline_model_parallel_size=1 \
# --gpus_per_node=2 \
# --tensor_model_parallel_size=2
# python examples/nlp/language_modeling/megatron_gpt_eval.py \
# --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \
# --tokens_to_generate=32 \
# --tensor_model_parallel_size=2 \
# --prompt=This is a test.
# rm examples/nlp/language_modeling/small_gpt.nemo
# L2_Megatron_Change_Partitions | |
# Repartitions the TP2 GPT checkpoint from TP=2/PP=1 to TP=1/PP=2 with
# megatron_change_num_partitions.py, writing test-reduce.nemo.
# NOTE(review): AFTER_SCRIPT uses plain `rm` (no -f), which errors when the
# target file was never written - confirm whether that is intended to double
# as an existence check.
L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_change_num_partitions.py \
        --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
        --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \
        --tensor_model_parallel_size 2 \
        --target_tensor_model_parallel_size 1 \
        --pipeline_model_parallel_size 1 \
        --target_pipeline_model_parallel_size 2
    AFTER_SCRIPT: |
      rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo
# Repartitions the TP2 GPT checkpoint from TP=2/PP=1 to TP=4/PP=1 with
# megatron_change_num_partitions.py, writing test-increase.nemo.
# NOTE(review): the job name says "PP_Num_Partitions_-1_to_2", but the command
# passes --target_pipeline_model_parallel_size 1, so pipeline parallelism is
# left unchanged. Either the name or the argument is stale - confirm which.
# The job id is kept as-is because other configuration may reference it.
# NOTE(review): AFTER_SCRIPT uses plain `rm` (no -f), which errors when the
# target file was never written - confirm that is intended.
L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_change_num_partitions.py \
        --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
        --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \
        --tensor_model_parallel_size 2 \
        --target_tensor_model_parallel_size 4 \
        --pipeline_model_parallel_size 1 \
        --target_pipeline_model_parallel_size 1
    AFTER_SCRIPT: |
      rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo
# T5 pretraining smoke test with tensor parallelism (TP=2, fp16, relative
# position embeddings on the encoder, text_mmap data). SCRIPT invokes
# megatron_t5_pretraining.py twice against the same exp_dir, both with
# max_steps=10:
#   1. trainer.val_check_interval=10
#   2. trainer.val_check_interval=1 plus exp_manager.resume_if_exists=True
# AFTER_SCRIPT removes the results and index-mapping directories.
L2_Megatron_T5_Pretraining_and_Resume_Training_TP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=10 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation=swiglu \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method=block \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.encoder.position_embedding_type=relative \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation=fast-swiglu \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method=block \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.encoder.transformer_block_type=pre_ln \
        model.decoder.transformer_block_type=pre_ln \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False \
        model.share_token_embeddings=False \
        model.share_decoder_tokens_head_embeddings=False

      python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=1 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation=swiglu \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method=block \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.encoder.position_embedding_type=relative \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation=fast-swiglu \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method=block \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.encoder.transformer_block_type=pre_ln \
        model.decoder.transformer_block_type=pre_ln \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False \
        model.share_token_embeddings=False \
        model.share_decoder_tokens_head_embeddings=False
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/t5_pretrain_results
      rm -rf examples/nlp/language_modeling/t5_index_mappings
# Megatron-Core T5 pretraining smoke test (model.mcore_t5=True,
# transformer_engine=True, TP=2, bf16, megatron_amp_O2). Both invocations set
# NVTE_FUSED_ATTN=0 and NVTE_FLASH_ATTN=0 in the command environment and use
# max_steps=10; the second adds exp_manager.resume_if_exists=True against the
# same exp_dir. AFTER_SCRIPT removes the results and index-mapping directories.
L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.max_epochs=null \
        trainer.max_steps=10 \
        trainer.val_check_interval=10 \
        trainer.accumulate_grad_batches=1 \
        trainer.precision=bf16 \
        model.megatron_amp_O2=True \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        model.mcore_t5=True \
        model.transformer_engine=True \
        model.tensor_model_parallel_size=2 \
        model.micro_batch_size=4 \
        model.global_batch_size=4 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.decoder.num_layers=4 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.encoder.transformer_block_type='pre_ln' \
        model.decoder.transformer_block_type='pre_ln' \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False

      NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.log_every_n_steps=1 \
        trainer.max_epochs=null \
        trainer.max_steps=10 \
        trainer.val_check_interval=10 \
        trainer.accumulate_grad_batches=1 \
        trainer.precision=bf16 \
        model.megatron_amp_O2=True \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.mcore_t5=True \
        model.transformer_engine=True \
        model.tensor_model_parallel_size=2 \
        model.micro_batch_size=4 \
        model.global_batch_size=4 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.decoder.num_layers=4 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.encoder.transformer_block_type='pre_ln' \
        model.decoder.transformer_block_type='pre_ln' \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/t5_pretrain_results
      rm -rf examples/nlp/language_modeling/t5_index_mappings
# T5 pretraining smoke test with ALiBi position embeddings on the encoder
# (model.encoder.position_embedding_type=alibi), TP=2, fp16, text_mmap data.
# SCRIPT invokes megatron_t5_pretraining.py twice against the same exp_dir,
# both with max_steps=10:
#   1. trainer.val_check_interval=10
#   2. trainer.val_check_interval=1 plus exp_manager.resume_if_exists=True
# AFTER_SCRIPT removes the results and index-mapping directories.
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure
    SCRIPT: |
      python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=10 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation=swiglu \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method=block \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.encoder.position_embedding_type=alibi \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation=swiglu \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method=block \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.encoder.transformer_block_type=pre_ln \
        model.decoder.transformer_block_type=pre_ln \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False \
        model.share_token_embeddings=False \
        model.share_decoder_tokens_head_embeddings=False

      python examples/nlp/language_modeling/megatron_t5_pretraining.py \
        trainer.devices=2 \
        trainer.accelerator=gpu \
        trainer.log_every_n_steps=1 \
        trainer.val_check_interval=1 \
        trainer.limit_val_batches=2 \
        trainer.accumulate_grad_batches=1 \
        trainer.max_steps=10 \
        trainer.precision=16 \
        trainer.gradient_clip_val=1.0 \
        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
        exp_manager.resume_if_exists=True \
        model.tensor_model_parallel_size=2 \
        model.seq_length=128 \
        model.encoder.num_layers=4 \
        model.encoder.hidden_size=64 \
        model.encoder.num_attention_heads=8 \
        model.encoder.activation=swiglu \
        model.encoder.masked_softmax_fusion=False \
        model.encoder.bias_activation_fusion=False \
        model.encoder.activations_checkpoint_method=block \
        model.encoder.activations_checkpoint_num_layers=1 \
        model.encoder.position_embedding_type=alibi \
        model.decoder.num_layers=2 \
        model.decoder.hidden_size=64 \
        model.decoder.num_attention_heads=8 \
        model.decoder.activation=swiglu \
        model.decoder.masked_softmax_fusion=False \
        model.decoder.bias_activation_fusion=False \
        model.decoder.activations_checkpoint_method=block \
        model.decoder.activations_checkpoint_num_layers=1 \
        model.encoder.transformer_block_type=pre_ln \
        model.decoder.transformer_block_type=pre_ln \
        model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
        model.data.data_impl=text_mmap \
        +model.data.data_impl_kwargs.newline_int=10 \
        +model.data.data_impl_kwargs.header_lines=0 \
        +model.data.data_impl_kwargs.workers=null \
        +model.data.data_impl_kwargs.sort_dataset_paths=False \
        model.share_token_embeddings=False \
        model.share_decoder_tokens_head_embeddings=False
    AFTER_SCRIPT: |
      rm -rf examples/nlp/language_modeling/t5_pretrain_results
      rm -rf examples/nlp/language_modeling/t5_index_mappings
# T5 pretraining + resume test with KERPLE encoder position embeddings | |
# (model.encoder.position_embedding_type=kerple) on 2 devices with | |
# model.tensor_model_parallel_size=2. Runs via the reusable _test_template.yml | |
# workflow once cicd-test-container-setup has finished. | |
# SCRIPT launches megatron_t5_pretraining.py twice against the same exp_dir; the | |
# second launch differs only by exp_manager.resume_if_exists=True and | |
# trainer.val_check_interval=1. | |
# NOTE(review): both launches set trainer.max_steps=10, so the resumed run presumably | |
# has no training steps left to execute - confirm this is intended. | |
# AFTER_SCRIPT deletes the results and index-mapping directories. | |
L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=swiglu \ | |
model.encoder.masked_softmax_fusion=False \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.position_embedding_type=kerple \ | |
model.decoder.num_layers=2 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=swiglu \ | |
model.decoder.masked_softmax_fusion=False \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=pre_ln \ | |
model.decoder.transformer_block_type=pre_ln \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ | |
model.data.data_impl=text_mmap \ | |
+model.data.data_impl_kwargs.newline_int=10 \ | |
+model.data.data_impl_kwargs.header_lines=0 \ | |
+model.data.data_impl_kwargs.workers=null \ | |
+model.data.data_impl_kwargs.sort_dataset_paths=False \ | |
model.share_token_embeddings=False \ | |
model.share_decoder_tokens_head_embeddings=False | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=1 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
exp_manager.resume_if_exists=True \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=swiglu \ | |
model.encoder.masked_softmax_fusion=False \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.position_embedding_type=kerple \ | |
model.decoder.num_layers=2 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=swiglu \ | |
model.decoder.masked_softmax_fusion=False \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=pre_ln \ | |
model.decoder.transformer_block_type=pre_ln \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ | |
model.data.data_impl=text_mmap \ | |
+model.data.data_impl_kwargs.newline_int=10 \ | |
+model.data.data_impl_kwargs.header_lines=0 \ | |
+model.data.data_impl_kwargs.workers=null \ | |
+model.data.data_impl_kwargs.sort_dataset_paths=False \ | |
model.share_token_embeddings=False \ | |
model.share_decoder_tokens_head_embeddings=False | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/t5_pretrain_results | |
rm -rf examples/nlp/language_modeling/t5_index_mappings | |
# T5 pretraining + resume test with pipeline parallelism on 2 devices | |
# (model.pipeline_model_parallel_size=2, split rank 1): a 4-layer pre_ln gelu | |
# encoder and a 1-layer post_ln decoder. Runs via the reusable _test_template.yml | |
# workflow once cicd-test-container-setup has finished. | |
# SCRIPT launches megatron_t5_pretraining.py twice against the same exp_dir; the | |
# second launch differs only by exp_manager.resume_if_exists=True and | |
# trainer.val_check_interval=1. | |
# NOTE(review): both launches set trainer.max_steps=10, so the resumed run presumably | |
# has no training steps left to execute - confirm this is intended. | |
# AFTER_SCRIPT deletes the results and index-mapping directories. | |
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
model.pipeline_model_parallel_size=2 \ | |
model.pipeline_model_parallel_split_rank=1 \ | |
model.seq_length=256 \ | |
model.encoder.num_layers=4 \ | |
model.decoder.num_layers=1 \ | |
model.encoder.hidden_size=64 \ | |
model.decoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.ffn_hidden_size=2048 \ | |
model.encoder.activation=gelu \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=pre_ln \ | |
model.decoder.transformer_block_type=post_ln \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=1 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
exp_manager.resume_if_exists=True \ | |
model.pipeline_model_parallel_size=2 \ | |
model.pipeline_model_parallel_split_rank=1 \ | |
model.seq_length=256 \ | |
model.encoder.num_layers=4 \ | |
model.decoder.num_layers=1 \ | |
model.encoder.hidden_size=64 \ | |
model.decoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.ffn_hidden_size=2048 \ | |
model.encoder.activation=gelu \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=pre_ln \ | |
model.decoder.transformer_block_type=post_ln \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/t5_pretrain_results | |
rm -rf examples/nlp/language_modeling/t5_index_mappings | |
# T5 pretraining test with mixture-of-experts overrides: 4 experts on both encoder | |
# and decoder (num_moe_experts=4), moe_frequency=3 for the encoder and 1 for the | |
# decoder. Runs via the reusable _test_template.yml workflow once | |
# cicd-test-container-setup has finished. | |
# SCRIPT is a single megatron_t5_pretraining.py launch on 2 devices for | |
# trainer.max_steps=10; there is no resume leg in this job. | |
# AFTER_SCRIPT deletes the results and index-mapping directories. | |
L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
model.pipeline_model_parallel_split_rank=1 \ | |
model.seq_length=256 \ | |
model.encoder.num_layers=4 \ | |
model.decoder.num_layers=1 \ | |
model.encoder.num_moe_experts=4 \ | |
model.decoder.num_moe_experts=4 \ | |
model.encoder.moe_frequency=3 \ | |
model.decoder.moe_frequency=1 \ | |
model.encoder.hidden_size=64 \ | |
model.decoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.ffn_hidden_size=2048 \ | |
model.encoder.activation=gelu \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=pre_ln \ | |
model.decoder.transformer_block_type=post_ln \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/t5_pretrain_results | |
rm -rf examples/nlp/language_modeling/t5_index_mappings | |
# UL2 pretraining + resume test on 2 devices with model.tensor_model_parallel_size=2, | |
# using normformer blocks (swiglu encoder with headscale, geglu decoder without). | |
# Runs via the reusable _test_template.yml workflow once cicd-test-container-setup | |
# has finished. | |
# SCRIPT launches megatron_t5_pretraining.py twice against the same exp_dir. Both | |
# launches select the UL2 config with `-cn megatron_ul2_config`, so the resume leg | |
# runs with the same config as the first leg instead of the script's default one. | |
# The second launch otherwise differs only by exp_manager.resume_if_exists=True and | |
# trainer.val_check_interval=1. | |
# NOTE(review): both launches set trainer.max_steps=10, so the resumed run presumably | |
# has no training steps left to execute - confirm this is intended. | |
# AFTER_SCRIPT deletes the results and index-mapping directories. | |
L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=swiglu \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=normformer \ | |
model.encoder.headscale=True \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=geglu \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.transformer_block_type=normformer \ | |
model.decoder.headscale=False \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=1 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
exp_manager.resume_if_exists=True \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=swiglu \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.encoder.transformer_block_type=normformer \ | |
model.encoder.headscale=True \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=geglu \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.transformer_block_type=normformer \ | |
model.decoder.headscale=False \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ | |
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/t5_pretrain_results | |
rm -rf examples/nlp/language_modeling/t5_index_mappings | |
# T5 evaluation test: runs megatron_t5_eval.py once on the megatron_t5_8m-refactor.nemo | |
# model file with a single prompt containing a <mask> token and | |
# --tensor_model_parallel_size 1. Runs via the reusable _test_template.yml workflow | |
# once cicd-test-container-setup has finished; the job defines no AFTER_SCRIPT. | |
L2_Megatron_T5_Eval: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_eval.py \ | |
--model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ | |
--prompt 'How do I fix my GPU memory issue? I am seeing <mask> out of memory.' \ | |
--tensor_model_parallel_size 1 | |
# BART pretraining + resume test on 2 devices with model.tensor_model_parallel_size=2 | |
# and 'reglu' activations on encoder and decoder. Runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# SCRIPT launches megatron_bart_pretraining.py twice against the same exp_dir: the | |
# first launch stops at trainer.max_steps=3; the second sets | |
# exp_manager.resume_if_exists=True and raises the limits to trainer.max_steps=6 and | |
# trainer.limit_val_batches=5. | |
# model.data.data_prefix is passed as a quoted dict with train/test/validation keys. | |
# AFTER_SCRIPT deletes the results directory. | |
L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=2 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=3 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation='reglu' \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method='block' \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation='reglu' \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method='block' \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' | |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=2 \ | |
trainer.limit_val_batches=5 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=6 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ | |
exp_manager.resume_if_exists=True \ | |
model.tensor_model_parallel_size=2 \ | |
model.seq_length=128 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation='reglu' \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method='block' \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation='reglu' \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method='block' \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/bart_pretrain_results | |
# BART pretraining + resume test with pipeline parallelism on 2 devices | |
# (model.pipeline_model_parallel_size=2, split rank 1), geglu activations and | |
# model.data.respect_document_boundaries=False. Runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# SCRIPT launches megatron_bart_pretraining.py twice against the same exp_dir; the | |
# second launch differs only by exp_manager.resume_if_exists=True and | |
# trainer.val_check_interval=1. | |
# NOTE(review): both launches set trainer.max_steps=10, so the resumed run presumably | |
# has no training steps left to execute - confirm this is intended. | |
# AFTER_SCRIPT deletes the results directory. | |
L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ | |
model.pipeline_model_parallel_size=2 \ | |
model.pipeline_model_parallel_split_rank=1 \ | |
model.seq_length=256 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=geglu \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=geglu \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.data.respect_document_boundaries=False \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] | |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \ | |
trainer.devices=2 \ | |
trainer.accelerator=gpu \ | |
trainer.log_every_n_steps=1 \ | |
trainer.val_check_interval=1 \ | |
trainer.limit_val_batches=2 \ | |
trainer.accumulate_grad_batches=1 \ | |
trainer.max_steps=10 \ | |
trainer.precision=16 \ | |
trainer.gradient_clip_val=1.0 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ | |
exp_manager.resume_if_exists=True \ | |
model.pipeline_model_parallel_size=2 \ | |
model.pipeline_model_parallel_split_rank=1 \ | |
model.seq_length=256 \ | |
model.encoder.num_layers=4 \ | |
model.encoder.hidden_size=64 \ | |
model.encoder.num_attention_heads=8 \ | |
model.encoder.activation=geglu \ | |
model.encoder.bias_activation_fusion=False \ | |
model.encoder.activations_checkpoint_method=block \ | |
model.encoder.activations_checkpoint_num_layers=1 \ | |
model.decoder.num_layers=4 \ | |
model.decoder.hidden_size=64 \ | |
model.decoder.num_attention_heads=8 \ | |
model.decoder.activation=geglu \ | |
model.decoder.bias_activation_fusion=False \ | |
model.decoder.activations_checkpoint_method=block \ | |
model.decoder.activations_checkpoint_num_layers=1 \ | |
model.data.respect_document_boundaries=False \ | |
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/bart_pretrain_results | |
# T5 LoRA PEFT test on 2 devices with model.tensor_model_parallel_size=2. Runs via | |
# the reusable _test_template.yml workflow once cicd-test-container-setup has | |
# finished. | |
# SCRIPT has three steps: | |
#   1. remove any leftover /home/TestData/nlp/t5_lora_tuning_tp2 directory; | |
#   2. run megatron_t5_finetuning.py with model.peft.peft_scheme=lora for | |
#      trainer.max_steps=3, restoring the base model from megatron_t5_8m_tp2.nemo | |
#      and writing to that directory as exp_dir; | |
#   3. run megatron_t5_generate.py with the same base model and the LoRA .nemo file | |
#      under that exp_dir, writing predictions with the `out` prefix there. | |
# AFTER_SCRIPT removes the directory again. | |
L2_Megatron_T5_PEFT_Lora_TP2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 | |
python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ | |
trainer.devices=2 \ | |
trainer.log_every_n_steps=1 \ | |
trainer.max_epochs=9999 \ | |
trainer.max_steps=3 \ | |
trainer.val_check_interval=3 \ | |
++trainer.limit_val_batches=2 \ | |
trainer.precision=16 \ | |
exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ | |
model.pipeline_model_parallel_size=1 \ | |
model.tensor_model_parallel_size=2 \ | |
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ | |
model.peft.peft_scheme=lora \ | |
model.answer_only_loss=True \ | |
model.micro_batch_size=1 \ | |
model.global_batch_size=1 \ | |
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ | |
model.data.train_ds.concat_sampling_probabilities=[1.0] \ | |
model.data.train_ds.num_workers=0 \ | |
model.data.validation_ds.num_workers=0 \ | |
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ | |
model.data.validation_ds.names=[quarel] | |
python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ | |
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ | |
model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ | |
model.peft.restore_from_ckpt_name=null \ | |
model.peft.restore_from_hparams_path=null \ | |
model.tensor_model_parallel_size=2 \ | |
trainer.devices=2 \ | |
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ | |
model.data.test_ds.names=[quarel4] \ | |
model.global_batch_size=2 \ | |
model.micro_batch_size=1 \ | |
model.data.test_ds.tokens_to_generate=10 \ | |
model.data.test_ds.write_predictions_to_file=True \ | |
model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \ | |
inference.greedy=True \ | |
inference.repetition_penalty=1.0 \ | |
inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl | |
AFTER_SCRIPT: | | |
rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 | |
# L2: Megatron Mock Data Generation | |
# Mock-data GPT pretraining test: runs megatron_gpt_pretraining.py for | |
# trainer.max_steps=10 with model.mcore_gpt=True, model.data.data_impl=mock and an | |
# empty model.data.data_prefix, so no dataset path is passed. Runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# AFTER_SCRIPT deletes the exp_dir written by the run, matching the cleanup done by | |
# the MockT5Dataset job, so results do not linger for later jobs. | |
L2_Megatron_Mock_Data_Generation_MockGPTDataset: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ | |
trainer.max_steps=10 \ | |
trainer.limit_val_batches=7 \ | |
trainer.val_check_interval=10 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ | |
model.mcore_gpt=True \ | |
model.data.data_impl=mock \ | |
model.data.data_prefix=[] | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/gpt_pretrain_results | |
# Mock-data T5 pretraining test: runs megatron_t5_pretraining.py for | |
# trainer.max_steps=10 with model.data.data_impl=mock and an empty | |
# model.data.data_prefix, so no dataset path is passed. Runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# AFTER_SCRIPT deletes the results directory. | |
L2_Megatron_Mock_Data_Generation_MockT5Dataset: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \ | |
trainer.max_steps=10 \ | |
trainer.limit_val_batches=3 \ | |
trainer.val_check_interval=10 \ | |
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ | |
model.data.data_impl=mock \ | |
model.data.data_prefix=[] | |
AFTER_SCRIPT: | | |
rm -rf examples/nlp/language_modeling/t5_pretrain_results | |
# L2: TTS Fast dev runs 1 | |
# Tacotron 2 fast dev run: trains examples/tts/tacotron2.py on the an4 manifests for | |
# one train batch, one validation batch and one epoch on a single GPU device, with | |
# reduced decoder/postnet sizes and batch_size=4. Uses the self-hosted-azure-gpus-1 | |
# runner, unlike the other TTS jobs in this group, and runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# The trailing `~key` overrides use Hydra's delete syntax to drop the text | |
# normalizer settings and trainer.check_val_every_n_epoch from the config. | |
L2_TTS_Fast_dev_runs_1_Tacotron_2: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
python examples/tts/tacotron2.py \ | |
train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
trainer.devices=1 \ | |
trainer.accelerator="gpu" \ | |
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ | |
trainer.strategy=auto \ | |
model.decoder.decoder_rnn_dim=256 \ | |
model.decoder.attention_rnn_dim=1024 \ | |
model.decoder.prenet_dim=128 \ | |
model.postnet.postnet_n_convolutions=3 \ | |
model.train_ds.dataloader_params.batch_size=4 \ | |
model.train_ds.dataloader_params.num_workers=0 \ | |
model.validation_ds.dataloader_params.batch_size=4 \ | |
model.validation_ds.dataloader_params.num_workers=0 \ | |
~model.text_normalizer \ | |
~model.text_normalizer_call_kwargs \ | |
~trainer.check_val_every_n_epoch | |
# WaveGlow fast dev run: trains examples/tts/waveglow.py on the an4 manifests for one | |
# train batch, one validation batch and one epoch on device 0, with a reduced model | |
# (n_flows=4, n_wn_layers=2, n_wn_channels=32) and batch_size=4. Runs via the | |
# reusable _test_template.yml workflow once cicd-test-container-setup has finished. | |
# The trailing `~trainer.check_val_every_n_epoch` uses Hydra's delete syntax to drop | |
# that key from the config. | |
L2_TTS_Fast_dev_runs_1_WaveGlow: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/tts/waveglow.py \ | |
train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
trainer.devices="[0]" \ | |
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ | |
trainer.strategy=auto \ | |
model.train_ds.dataloader_params.batch_size=4 \ | |
model.train_ds.dataloader_params.num_workers=0 \ | |
model.validation_ds.dataloader_params.batch_size=4 \ | |
model.validation_ds.dataloader_params.num_workers=0 \ | |
model.waveglow.n_flows=4 \ | |
model.waveglow.n_wn_layers=2 \ | |
model.waveglow.n_wn_channels=32 \ | |
~trainer.check_val_every_n_epoch | |
# FastPitch fast dev run: trains examples/tts/fastpitch.py with the | |
# fastpitch_align_v1.05 config on the an4 manifests for one train batch, one | |
# validation batch and one epoch on device 0, with reduced embedding/FFT sizes and | |
# batch_size=4. Supplementary data is read from an4_dataset/beta_priors. Runs via | |
# the reusable _test_template.yml workflow once cicd-test-container-setup has | |
# finished. | |
# The trailing `~key` overrides use Hydra's delete syntax to drop | |
# trainer.check_val_every_n_epoch and the text normalizer settings from the config. | |
L2_TTS_Fast_dev_runs_1_FastPitch: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/tts/fastpitch.py \ | |
--config-name fastpitch_align_v1.05 \ | |
train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
sup_data_path=/home/TestData/an4_dataset/beta_priors \ | |
trainer.devices="[0]" \ | |
+trainer.limit_train_batches=1 \ | |
+trainer.limit_val_batches=1 \ | |
trainer.max_epochs=1 \ | |
trainer.strategy=auto \ | |
model.pitch_mean=212.35873413085938 \ | |
model.pitch_std=68.52806091308594 \ | |
model.train_ds.dataloader_params.batch_size=4 \ | |
model.train_ds.dataloader_params.num_workers=0 \ | |
model.validation_ds.dataloader_params.batch_size=4 \ | |
model.validation_ds.dataloader_params.num_workers=0 \ | |
model.symbols_embedding_dim=64 \ | |
model.input_fft.d_inner=384 \ | |
model.input_fft.n_layer=2 \ | |
model.output_fft.d_inner=384 \ | |
model.output_fft.n_layer=2 \ | |
~trainer.check_val_every_n_epoch \ | |
~model.text_normalizer \ | |
~model.text_normalizer_call_kwargs | |
# OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure | |
# timeout-minutes: 10 | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python examples/tts/radtts.py \ | |
# train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
# validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
# sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ | |
# trainer.devices="[0]" \ | |
# +trainer.limit_train_batches=1 \ | |
# +trainer.limit_val_batches=1 \ | |
# trainer.max_epochs=1 \ | |
# trainer.strategy=auto \ | |
# model.pitch_mean=212.35873413085938 \ | |
# model.pitch_std=68.52806091308594 \ | |
# model.train_ds.dataloader_params.batch_size=4 \ | |
# model.train_ds.dataloader_params.num_workers=0 \ | |
# model.validation_ds.dataloader_params.batch_size=4 \ | |
# model.validation_ds.dataloader_params.num_workers=0 \ | |
# export_dir=/home/TestData/radtts_test \ | |
# model.optim.lr=0.0001 \ | |
# model.modelConfig.decoder_use_partial_padding=True \ | |
# ~trainer.check_val_every_n_epoch \ | |
# ~model.text_normalizer \ | |
# ~model.text_normalizer_call_kwargs | |
# #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# # if: "failure()" | |
# Mixer-TTS fast dev run: trains examples/tts/mixer_tts.py on the an4 manifests for | |
# one train batch, one validation batch and one epoch on device 0 with batch_size=4. | |
# Supplementary data is read from an4_dataset/sup_data. Runs via the reusable | |
# _test_template.yml workflow once cicd-test-container-setup has finished. | |
# The trailing `~key` overrides use Hydra's delete syntax to drop | |
# trainer.check_val_every_n_epoch and the text normalizer settings from the config. | |
L2_TTS_Fast_dev_runs_1_Mixer-TTS: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/tts/mixer_tts.py \ | |
train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
sup_data_path=/home/TestData/an4_dataset/sup_data \ | |
trainer.devices="[0]" \ | |
+trainer.limit_train_batches=1 \ | |
+trainer.limit_val_batches=1 \ | |
trainer.max_epochs=1 \ | |
trainer.strategy=auto \ | |
model.pitch_mean=212.35873413085938 \ | |
model.pitch_std=68.52806091308594 \ | |
model.train_ds.dataloader_params.batch_size=4 \ | |
model.train_ds.dataloader_params.num_workers=0 \ | |
model.validation_ds.dataloader_params.batch_size=4 \ | |
model.validation_ds.dataloader_params.num_workers=0 \ | |
~trainer.check_val_every_n_epoch \ | |
~model.text_normalizer \ | |
~model.text_normalizer_call_kwargs | |
# HiFi-GAN fast dev run: trains examples/tts/hifigan.py on the an4 manifests for one | |
# train batch, one validation batch and one epoch on device 0, with | |
# model.generator.upsample_initial_channel=64, batch_size=4 and +model.debug=true. | |
# Runs via the reusable _test_template.yml workflow once cicd-test-container-setup | |
# has finished. | |
# Unlike the sibling TTS jobs, max_epochs is appended with `+trainer.max_epochs=1`. | |
# The trailing `~trainer.check_val_every_n_epoch` uses Hydra's delete syntax to drop | |
# that key from the config. | |
L2_TTS_Fast_dev_runs_1_Hifigan: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
SCRIPT: | | |
python examples/tts/hifigan.py \ | |
train_dataset=/home/TestData/an4_dataset/an4_train.json \ | |
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ | |
trainer.devices="[0]" \ | |
+trainer.limit_train_batches=1 \ | |
+trainer.limit_val_batches=1 \ | |
+trainer.max_epochs=1 \ | |
trainer.strategy=auto \ | |
model.train_ds.dataloader_params.batch_size=4 \ | |
model.train_ds.dataloader_params.num_workers=0 \ | |
model.validation_ds.dataloader_params.batch_size=4 \ | |
model.validation_ds.dataloader_params.num_workers=0 \ | |
model.generator.upsample_initial_channel=64 \ | |
+model.debug=true \ | |
~trainer.check_val_every_n_epoch | |
# L2: NeRF | |
# L2_NeRF_DreamFusion: | |
# needs: [cicd-test-container-setup] | |
# runs-on: self-hosted-azure | |
# container: | |
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} | |
# options: | |
# # --user 0:128 | |
# --device=/dev/nvidia0 | |
# --gpus all | |
# --shm-size=8g | |
# --env TRANSFORMERS_OFFLINE=0 | |
# --env HYDRA_FULL_ERROR=1 | |
# --volume /mnt/datadrive/TestData:/home/TestData | |
# steps: | |
# - name: Checkout repository | |
# uses: actions/checkout@v4 | |
# - run: | | |
# python examples/multimodal/text_to_image/nerf/main.py \ | |
# trainer.num_nodes=1 \ | |
# trainer.devices="[0]" \ | |
# trainer.max_steps=1000 \ | |
# model.prompt="a DSLR photo of a delicious hamburger" \ | |
# exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results | |
# | |
# rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results | |
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" | |
# if: "failure()" | |
# Speech checkpoint test: runs examples/asr/speech_to_text_eval.py with | |
# CUDA_VISIBLE_DEVICES=0 on the pretrained QuartzNet15x5Base-En model against the | |
# librivox-dev-other manifest, with batch_size=64 and tolerance=0.1012. Runs via the | |
# reusable _test_template.yml workflow once cicd-test-container-setup has finished. | |
# TIMEOUT: 20 is passed to the template (presumably minutes - confirm against | |
# _test_template.yml). | |
# AFTER_SCRIPT removes examples/asr/evaluation_transcripts.json. | |
Speech_Checkpoints_tests: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure | |
TIMEOUT: 20 | |
SCRIPT: | | |
CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ | |
pretrained_name=QuartzNet15x5Base-En \ | |
dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ | |
batch_size=64 \ | |
tolerance=0.1012 | |
AFTER_SCRIPT: | | |
rm -f examples/asr/evaluation_transcripts.json | |
# Stable Diffusion training test on one device (runner self-hosted-azure-gpus-1). | |
# Runs via the reusable _test_template.yml workflow once cicd-test-container-setup | |
# has finished. | |
# SCRIPT first removes any leftover sd_train_results directory, then runs sd_train.py | |
# for trainer.max_steps=3 with model.data.synthetic_data=True, the | |
# load_vae/load_unet/load_encoder flags set to False and the checkpoint callback | |
# disabled (exp_manager.create_checkpoint_callback=False). | |
# NOTE(review): +trainer.val_check_interval=10 is larger than trainer.max_steps=3, so | |
# validation presumably never runs in this job - confirm this is intended. | |
# AFTER_SCRIPT removes the results directory again. | |
L2_Stable_Diffusion_Training: | |
needs: [cicd-test-container-setup] | |
uses: ./.github/workflows/_test_template.yml | |
with: | |
RUNNER: self-hosted-azure-gpus-1 | |
SCRIPT: | | |
rm -rf examples/multimodal/text_to_image/sd_train_results | |
python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ | |
trainer.devices=1 \ | |
trainer.max_steps=3 \ | |
+trainer.val_check_interval=10 \ | |
trainer.limit_val_batches=2 \ | |
trainer.gradient_clip_val=0 \ | |
exp_manager.exp_dir=examples/multimodal/text_to_image/sd_train_results \ | |
exp_manager.create_checkpoint_callback=False \ | |
exp_manager.resume_if_exists=False \ | |
model.resume_from_checkpoint=null \ | |
model.precision=16 \ | |
model.micro_batch_size=1 \ | |
model.global_batch_size=1 \ | |
model.first_stage_key=moments \ | |
model.cond_stage_key=encoded \ | |
+model.load_vae=False \ | |
+model.load_unet=False \ | |
+model.load_encoder=False \ | |
model.parameterization=v \ | |
model.load_only_unet=False \ | |
model.text_embedding_dropout_rate=0.0 \ | |
model.inductor=True \ | |
model.inductor_cudagraphs=False \ | |
model.capture_cudagraph_iters=15 \ | |
+model.unet_config.num_head_channels=64 \ | |
+model.unet_config.use_linear_in_transformer=True \ | |
model.unet_config.context_dim=1024 \ | |
model.unet_config.use_flash_attention=null \ | |
model.unet_config.resblock_gn_groups=16 \ | |
model.unet_config.unet_precision=fp16 \ | |
+model.unet_config.timesteps=1000 \ | |
model.optim.name=megatron_fused_adam \ | |
+model.optim.capturable=True \ | |
+model.optim.master_weights=True \ | |
model.optim.weight_decay=0.01 \ | |
model.first_stage_config.from_pretrained=null \ | |
model.data.num_workers=16 \ | |
model.data.synthetic_data=True | |
AFTER_SCRIPT: | | |
rm -rf examples/multimodal/text_to_image/sd_train_results | |
Nemo_CICD_Test:
  # Aggregation gate for the whole pipeline. It depends on every required test
  # job, always runs, and:
  #   * succeeds when none of the needed jobs was cancelled,
  #   * posts a Slack message listing each needed job whose `conclusion`
  #     output is "failure" (only when a webhook secret is configured),
  #   * fails when any needed job was cancelled.
  needs:
    - L0_Unit_Tests_GPU
    - L0_Unit_Tests_CPU
    - L2_Community_LLM_Checkpoints_tests_Llama
    - L2_Community_LLM_Checkpoints_tests_StarCoder
    - L2_Community_LLM_Checkpoints_tests_Falcon
    # - OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2
    - ASR_dev_run_Speech_to_Text
    - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
    - ASR_dev_run_Speech_Pre-training_-_CitriNet
    - ASR_dev_run_Speech_To_Text_Finetuning
    # - OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning
    - ASR_dev_run_Speech_to_Text_WPE_-_Conformer
    - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
    - L2_Speech_to_Text_EMA
    - L2_Speaker_dev_run_Speaker_Recognition
    - L2_Speaker_dev_run_Speaker_Diarization
    - L2_Speaker_dev_run_Speech_to_Label
    - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
    - L2_Speaker_dev_run_Clustering_Diarizer_Inference
    - L2_Speaker_dev_run_Neural_Diarizer_Inference
    - L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
    - L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
    - L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
    - L2_ASR_Adapters_Linear_Adapters
    - L2_ASR_Adapters_RelPos_MHA_Adapters
    - L2_Speech_Transcription_Speech_to_Text_Transcribe
    - L2_Transducer_alignment_Running_pytest
    - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
    - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
    - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
    - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
    - L2_Duplex_Text_Normalization_with_Tarred_dataset
    - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification
    - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification
    - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test
    - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test
    - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1
    - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification
    - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation
    - L2_Pretraining_BERT_pretraining_from_Text
    - L2_Pretraining_BERT_from_Preprocessed
    - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN
    - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN
    - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation
    - L2_NMT_Attention_is_All_You_Need_Inference
    - L2_NMT_Attention_is_All_You_Need_Finetuning
    - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation
    - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation
    - L2_Megatron_NMT_Training_TP2
    - L2_Megatron_BART_Perceiver_MIM_Training_TP2
    - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism
    - L2_Megatron_Bert_Pretraining_and_Resume_Training
    - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training
    - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training
    - L2_Megatron_RETRO_Pretraining_and_Resume_Training
    - L2_RAG_Pipeline_Indexing
    - L2_RAG_Pipeline_Generating
    - L2_BioMegatron_Bert_NER_Task
    - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
    - L2_Megatron_GPT_Finetuning_PP2
    - L2_Megatron_GPT_Finetuning_StarCoder_PP1
    - L2_Megatron_GPT_Embedding
    - L2_Megatron_GPT_PEFT_Lora_PP2_O2
    - L2_Megatron_GPT_PEFT_Lora_TP2_O1
    - L2_Megatron_GPT_Eval
    - L2_Megatron_GPT_Eval_PP2
    - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
    - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
    - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
    - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
    - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining
    - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_T5_Eval
    - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2
    - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2
    - L2_Megatron_T5_PEFT_Lora_TP2
    - L2_Megatron_Mock_Data_Generation_MockGPTDataset
    - L2_Megatron_Mock_Data_Generation_MockT5Dataset
    - L2_TTS_Fast_dev_runs_1_Tacotron_2
    - L2_TTS_Fast_dev_runs_1_WaveGlow
    - L2_TTS_Fast_dev_runs_1_FastPitch
    # - OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS
    - L2_TTS_Fast_dev_runs_1_Mixer-TTS
    - L2_TTS_Fast_dev_runs_1_Hifigan
    - Speech_Checkpoints_tests
    - L2_Stable_Diffusion_Training
  # Run even when needed jobs failed or were cancelled; the steps below decide
  # the final status.
  if: always()
  runs-on: ubuntu-latest
  steps:
    - if: ${{ always() }}
      id: pipeline-conclusion
      run: |
        # Slack notifications are sent only on test failure (not cancelled):
        FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }}
        echo "FAILED=$FAILED" >> "$GITHUB_OUTPUT"
        # Mark as successful if no job was cancelled:
        SUCCESS=${{ !contains(needs.*.result, 'cancelled') }}
        echo "SUCCESS=$SUCCESS" >> "$GITHUB_OUTPUT"

    # This should depend on all the tests so we block/unblock based on all
    # tests passing
    - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
      run: exit 0

    - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
      name: Checkout repository
      uses: actions/checkout@v4

    # Post one Slack message with a section per failed job. Skipped when the
    # SLACK_WEBHOOK secret is empty.
    - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
      env:
        SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # User-controlled context values are handed over through the
        # environment instead of being spliced into the script text, so a
        # crafted branch name cannot inject shell commands.
        PR_AUTHOR: ${{ github.actor }}
        PR_BRANCH: ${{ github.head_ref || github.ref_name }}
      run: |
        set -x

        PR_INFO=$(curl -L \
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer $GITHUB_TOKEN" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
          "https://api.github.com/repos/${{ github.repository }}/pulls/${{ github.event.number }}"
        )
        # jq -r emits the raw string, so quotes inside the PR title survive.
        PR_URL=$(jq -r '.html_url' <<<"$PR_INFO")
        PR_TITLE=$(jq -r '.title' <<<"$PR_INFO")

        PIPELINE_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

        BASE_MESSAGE=$(jq -n -c --arg url "$PIPELINE_URL" '
          {
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": ("🚨 *CI/CD failure at <" + $url + "|NeMo CI>*.")
                }
              }
            ]
          }')

        # We are close to reaching 100 jobs: Once we break that barrier, we have to iterate pages
        JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"
        # The job list is the same for every failed job, so fetch it once.
        JOBS_JSON=$(curl -s -H "Authorization: token $GITHUB_TOKEN" "$JOBS_URL")

        # One compact JSON object per line for every needed job whose
        # `conclusion` output is "failure".
        FAILED_JOBS=$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")')

        SUMMARY="[]"
        while IFS= read -r JOB; do
          # The here-string yields a single empty line when nothing matched.
          [ -n "$JOB" ] || continue

          JOB_NAME="$(jq -r '.key' <<<"$JOB") / main"
          JOB_ID=$(jq -r --arg job_name "$JOB_NAME" '.jobs[]? | select(.name == $job_name) | .id' <<<"$JOBS_JSON")
          JOB_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/job/$JOB_ID"
          # The `log` output is base64-encoded; decode it for the message.
          LOGS=$(printf '%s\n' "$JOB" | yq '(.value.outputs.log | @base64d)' | tr -d '"')

          SUMMARY=$(jq \
            --arg pr "<$PR_URL|$PR_TITLE>" \
            --arg job "<$JOB_URL|$JOB_NAME>" \
            --arg logs "$LOGS" \
            --arg author "<https://github.com/$PR_AUTHOR|$PR_AUTHOR>" \
            --arg branch "<https://github.com/${{ github.repository }}/tree/$PR_BRANCH|$PR_BRANCH>" \
            '. += [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": (
                    "PR: " + $pr
                    + "\nJob: " + $job
                    + "\nAuthor: " + $author
                    + "\nBranch: " + $branch
                    + "\nLogs:"
                    + "```\n" + $logs + "\n```"
                  )
                }
              }
            ]' <<<"$SUMMARY")
        done <<<"$FAILED_JOBS"

        MESSAGE=$(jq -c --argjson summary "$SUMMARY" '.blocks += $summary' <<<"$BASE_MESSAGE")

        curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"

    - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }}
      run: |
        exit 1