Skip to content

Add TRT-LLM params like max_num_tokens and opt_num_tokens #2057

Add TRT-LLM params like max_num_tokens and opt_num_tokens

Add TRT-LLM params like max_num_tokens and opt_num_tokens #2057

Workflow file for this run

# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "CICD NeMo"
on:
pull_request:
branches:
- 'main'
- 'r**'
types: [ labeled ]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
gpu-test:
runs-on: self-hosted-azure
if: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Run nvidia-smi test
run: |
whoami
nvidia-smi
cicd-cluster-clean:
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Clean server from old files
run: |
docker container prune --filter "until=24h" --force
docker image prune -a --filter "until=24h" --force
# checkout-repository:
# runs-on: self-hosted-azure
# container:
# image: nvcr.io/nvidia/pytorch:24.02-py3
# volumes:
# - ${{ github.workspace }}:/workspace
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# path: ${{ github.run_id }}
cicd-test-container-setup:
needs: [cicd-cluster-clean]
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' }}
# uses: actions/cache@v2
#container:
# image: nvcr.io/nvidia/pytorch:24.02-py3
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
# We use `docker` driver as this speeds things up for
# trivial (non-multi-stage) builds.
driver: docker
- name: Build and push
uses: docker/build-push-action@v5
with:
file: Dockerfile.ci
push: true
cache-from: nemoci.azurecr.io/nemo_container:latest
cache-to: type=inline
tags: |
nemoci.azurecr.io/nemo_container_${{ github.run_id }}
nemoci.azurecr.io/nemo_container:latest
- name: Run some checks
run: |
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
# Basic Import Checks
python -c "import nemo.collections.asr as nemo_asr"
python -c "import nemo.collections.nlp as nemo_nlp"
python -c "import nemo.collections.tts as nemo_tts"
python setup.py style
python tests/check_copyright_header.py --dir .
# These checks are not crucial
exit 0
'
# - name: Build and push to local registry
# uses: docker/build-push-action@v5
# with:
# context: .
# push: true
# tags: nemoci.azurecr.io/name/app:latest
# - name: Inspect
# run: |
# docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest
#- name: Post-workflow execution
# uses: gacts/run-and-post-run@v1
# with:
# post: |
# chmod -R 777 .
L0_Unit_Tests_GPU:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: "L0: Unit Tests GPU"
run: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L0_Unit_Tests_CPU:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-cpu
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: "L0: Unit Tests CPU"
run: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L0_Setup_Test_Data_And_Models:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python -m tests.setup --save_dir /home/TestData/nlp
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
## - name: L2: Multimodal Imagen Train
# L2: Community LLM Checkpoints tests
L2_Community_LLM_Checkpoints_tests_Llama:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
--input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
--output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
--precision=16
- name: Cleanup
if: "always()"
run: |
rm -rf /home/TestData/nlp/megatron_llama/model_weights
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Community_LLM_Checkpoints_tests_Llama3:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v2
- run: |
CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
--input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \
--output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \
--precision=16
rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo
- name: Cleanup
if: "always()"
run: |
rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Community_LLM_Checkpoints_tests_StarCoder:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }};
python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \
--input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \
--output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}
- name: Cleanup
if: "always()"
run: |
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo;
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Community_LLM_Checkpoints_tests_Falcon:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \
--input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \
--output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
- name: Cleanup
if: "always()"
run: |
rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# this test is using a 7B model which is too large for GitHub CI
# replace the model in this test with a toy model or move the test
# to the nightly CI
# L2_Community_LLM_Checkpoints_tests_Baichuan2:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \
# --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \
# --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo
# rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
L2_PTQ_Llama2_Export_Only:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
model_save=/home/TestData/nlp/megatron_llama/ci_baseline
rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_PTQ_Llama2_FP8:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
tensor_model_parallel_size=2 \
trainer.devices=2 \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=fp8 \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.inference_tensor_parallel=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_PTQ_Llama2_INT8_SQ:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# TODO: investigate int4_awq stuck issues and restore the test
#L2_PTQ_Llama2_INT4_AWQ:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# tensor_model_parallel_size=1 \
# trainer.devices=1 \
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
# quantization.algorithm=int4_awq \
# quantization.num_calib_size=8 \
# inference.batch_size=2 \
# model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
#
# rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc.py \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_results
rm -rf examples/asr/speech_to_text_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
ASR_dev_run_Speech_to_Text_WPE_-_CitriNet:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
--config-path="../conf/citrinet/" --config-name="config_bpe" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
model.tokenizer.type="wpe" \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results
rm -rf examples/asr/speech_to_text_wpe_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
ASR_dev_run_Speech_Pre-training_-_CitriNet:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/speech_pretraining/speech_pre_training.py \
--config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_pre_training_results
rm -rf examples/asr/speech_pre_training_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
ASR_dev_run_Speech_To_Text_Finetuning:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/speech_to_text_finetune.py \
--config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
model.tokenizer.update_tokenizer=False \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_finetuning_results
rm -rf examples/asr/speech_finetuning_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/speech_to_text_finetune.py \
--config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
~model.train_ds.hf_data_cfg \
model.train_ds.num_workers=1 \
model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
model.train_ds.streaming=true \
+model.train_ds.hf_data_cfg.path="librispeech_asr" \
+model.train_ds.hf_data_cfg.name=null \
+model.train_ds.hf_data_cfg.split="test.clean" \
+model.train_ds.hf_data_cfg.streaming=true \
~model.validation_ds.hf_data_cfg \
model.validation_ds.streaming=true \
+model.validation_ds.hf_data_cfg.path="librispeech_asr" \
+model.validation_ds.hf_data_cfg.name=null \
+model.validation_ds.hf_data_cfg.split="test.clean" \
+model.validation_ds.hf_data_cfg.streaming=true \
~model.test_ds \
init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
model.tokenizer.update_tokenizer=False \
model.optim.sched.warmup_steps=0 \
+model.optim.sched.max_steps=3 \
trainer.max_epochs=null \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_finetuning_results
rm -rf examples/asr/speech_finetuning_results
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
ASR_dev_run_Speech_to_Text_WPE_-_Conformer:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
--config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
model.tokenizer.type="wpe" \
model.train_ds.batch_size=4 \
model.validation_ds.batch_size=4 \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results
rm -rf examples/asr/speech_to_text_wpe_conformer_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: ASR dev run - part two
ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
--config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
model.tokenizer.type="wpe" \
model.encoder.d_model=144 \
model.train_ds.batch_size=4 \
model.validation_ds.batch_size=4 \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results
rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speech_to_Text_EMA:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc.py \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.devices=2 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
+exp_manager.ema.enable=True \
exp_manager.exp_dir=examples/asr/speech_to_text_results
rm -rf examples/asr/speech_to_text_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2_Speech_to_Text_AED:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure-gpus-1
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/asr/speech_multitask/speech_to_text_aed.py \
# model.prompt_format=canary \
# model.model_defaults.asr_enc_hidden=256 \
# model.model_defaults.lm_dec_hidden=256 \
# model.encoder.n_layers=12 \
# model.transf_encoder.num_layers=0 \
# model.transf_decoder.config_dict.num_layers=12 \
# model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \
# ++model.train_ds.is_tarred=false \
# model.train_ds.batch_duration=60 \
# +model.train_ds.text_field="answer" \
# +model.train_ds.lang_field="target_lang" \
# model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \
# +model.validation_ds.text_field="answer" \
# +model.validation_ds.lang_field="target_lang" \
# model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \
# +model.test_ds.text_field="answer" \
# +model.test_ds.lang_field="target_lang" \
# model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \
# model.tokenizer.langs.spl_tokens.type="bpe" \
# model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \
# model.tokenizer.langs.en.type=bpe \
# ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \
# ++model.tokenizer.langs.es.type=bpe \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.use_distributed_sampler=false \
# +trainer.fast_dev_run=True \
# exp_manager.exp_dir=examples/asr/speech_to_text_aed_results
# rm -rf examples/asr/speech_to_text_results
# L2: Speaker dev run
L2_Speaker_dev_run_Speaker_Recognition:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/speaker_tasks/recognition/speaker_reco.py \
model.train_ds.batch_size=10 \
model.validation_ds.batch_size=2 \
model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \
model.decoder.num_classes=2 \
trainer.max_epochs=10 \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results
rm -rf examples/speaker_tasks/recognition/speaker_recognition_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Speaker_Diarization:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \
model.diarizer.speaker_embeddings.model_path=titanet_large \
model.train_ds.batch_size=5 \
model.validation_ds.batch_size=5 \
model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results
rm -rf examples/speaker_tasks/diarization/speaker_diarization_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Speech_to_Label:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/speech_classification/speech_to_label.py \
model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
~model.preprocessor.window_size \
~model.preprocessor.window_stride \
~model.preprocessor.window \
~model.preprocessor.n_mels \
~model.preprocessor.n_mfcc \
~model.preprocessor.n_fft \
exp_manager.exp_dir=examples/asr/speech_to_label_results
rm -rf examples/asr/speech_to_label_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \
diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
diarizer.speaker_embeddings.parameters.save_embeddings=True \
diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \
diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \
diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \
diarizer.asr.model_path=QuartzNet15x5Base-En \
diarizer.asr.parameters.asr_based_vad=True \
diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results
rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Clustering_Diarizer_Inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \
diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
diarizer.speaker_embeddings.parameters.save_embeddings=True \
diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \
diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \
diarizer.speaker_embeddings.parameters.multiscale_weights=null \
diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results
rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Neural_Diarizer_Inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \
diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \
diarizer.speaker_embeddings.parameters.save_embeddings=True \
diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results
rm -rf examples/speaker_tasks/diarization/neural_diarizer_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python tools/speech_data_simulator/multispeaker_simulator.py \
--config-path=conf --config-name=data_simulator.yaml \
data_simulator.random_seed=42 \
data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \
data_simulator.outputs.output_dir=./test_simulator \
data_simulator.session_config.num_sessions=2 \
data_simulator.session_config.session_length=60
rm -rf ./test_simulator
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: ASR Multi-dataloader dev run
L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_ctc/speech_to_text_ctc.py \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \
trainer.devices=1 \
trainer.accelerator="gpu" \
trainer.max_epochs=1 \
trainer.max_steps=1 \
+trainer.num_sanity_val_steps=1 \
exp_manager.exp_dir=examples/asr/speech_to_text_results
rm -rf examples/asr/speech_to_text_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/speech_classification/speech_to_label.py \
model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \
trainer.devices=1 \
trainer.accelerator="gpu" \
trainer.max_epochs=1 \
trainer.max_steps=1 \
+trainer.num_sanity_val_steps=1 \
model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
~model.preprocessor.window_size \
~model.preprocessor.window_stride \
~model.preprocessor.window \
~model.preprocessor.n_mels \
~model.preprocessor.n_mfcc \
~model.preprocessor.n_fft \
exp_manager.exp_dir=examples/asr/speech_to_label_results
rm -rf examples/asr/speech_to_label_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: ASR Adapters
L2_ASR_Adapters_Linear_Adapters:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_adapters/train_asr_adapter.py \
model.pretrained_model="stt_en_conformer_ctc_small" \
model.adapter.adapter_name="an4" \
model.adapter.linear.in_features=176 \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.max_steps=5 \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results
rm -rf examples/asr/speech_to_text_adapters_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_ASR_Adapters_RelPos_MHA_Adapters:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/asr_adapters/train_asr_adapter.py \
model.pretrained_model="stt_en_conformer_ctc_small" \
model.adapter.adapter_name="encoder:an4" \
model.adapter.adapter_type="tiny_attn" \
model.adapter.tiny_attn.n_feat=176 \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.max_steps=5 \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results
rm -rf examples/asr/speech_to_text_adapters_mha_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Speech Transcription
L2_Speech_Transcription_Speech_to_Text_Transcribe:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/asr/transcribe_speech.py \
pretrained_name="QuartzNet15x5Base-En" \
audio_dir="/home/TestData/an4_transcribe/test_subset/" \
output_filename="stt_test_res.json" \
amp=true
rm -rf stt_test_res.json
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Transducer alignment
L2_Transducer_alignment_Running_pytest:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Segmentation Tool
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd tools/ctc_segmentation && \
TIME=`date +"%Y-%m-%d-%T"` && \
/bin/bash run_segmentation.sh \
--MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \
--DATA_DIR=/home/TestData/ctc_segmentation/eng \
--OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \
--LANGUAGE=en \
--USE_NEMO_NORMALIZATION="TRUE" && \
python /home/TestData/ctc_segmentation/verify_alignment.py \
-r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \
-g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \
rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd tools/ctc_segmentation && \
TIME=`date +"%Y-%m-%d-%T"` && \
/bin/bash run_segmentation.sh \
--MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
--DATA_DIR=/home/TestData/ctc_segmentation/ru \
--OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
--LANGUAGE=ru \
--ADDITIONAL_SPLIT_SYMBOLS=";" && \
python /home/TestData/ctc_segmentation/verify_alignment.py \
-r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
-g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt && \
rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: G2P Models
L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
python g2p_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/g2p.json \
validation_manifest=/home/TestData/g2p/g2p.json \
model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \
trainer.max_epochs=1 \
model.max_source_len=64 \
trainer.devices=1 \
do_training=True \
do_testing=True \
exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \
+exp_manager.use_datetime_version=False\
+exp_manager.version=test \
--config-name=g2p_conformer_ctc && \
python g2p_inference.py \
pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \
manifest_filepath=/home/TestData/g2p/g2p.json \
phoneme_field=text
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# TODO: pleasefixme @redoctopus
# - name: ByT5G2P training, evaluation and inference
# run: |
# cd examples/tts/g2p && \
# TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
# python g2p_train_and_evaluate.py \
# train_manifest=/home/TestData/g2p/g2p.json \
# validation_manifest=/home/TestData/g2p/g2p.json \
# model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
# trainer.max_epochs=1 \
# model.max_source_len=64 \
# trainer.devices=1 \
# do_training=True \
# do_testing=True \
# exp_manager.exp_dir=${OUTPUT_DIR_T5} \
# +exp_manager.use_datetime_version=False\
# +exp_manager.version=test && \
# python g2p_inference.py \
# pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \
# manifest_filepath=/home/TestData/g2p/g2p.json \
# phoneme_field=text
# }
# }
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
python g2p_heteronym_classification_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/manifest.json \
validation_manifest=/home/TestData/g2p/manifest.json \
test_manifest=/home/TestData/g2p/manifest.json \
model.wordids=/home/TestData/g2p/wordids.tsv \
trainer.max_epochs=1 \
model.max_seq_length=64 \
do_training=True \
do_testing=True \
exp_manager.exp_dir=${OUTPUT_DIR} \
+exp_manager.use_datetime_version=False\
+exp_manager.version=test && \
python g2p_heteronym_classification_inference.py \
manifest=/home/TestData/g2p/manifest.json \
pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \
output_manifest=preds.json
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Dialogue Classification
# TODO: pleasefixme
# L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure-gpus-1
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# cd examples/nlp/dialogue && \
# python dialogue.py \
# model.dataset.data_dir=/home/TestData/nlp/sgd_small \
# model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\
# model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\
# model.dataset.dialogues_example_dir=sgd_gen_outputs \
# model.dataset.task_name=debug_sample \
# trainer.max_steps=1 \
# trainer.max_epochs=1 \
# model.train_ds.batch_size=2 \
# model.validation_ds.batch_size=2 \
# model.test_ds.batch_size=2 \
# model.nemo_path=null \
# trainer.val_check_interval=0.0 \
# trainer.devices=1 \
# model.dataset.use_cache=false \
# model.tokenizer.special_tokens={pad_token:"endoftext"} \
# model.tokenizer.tokenizer_name=gpt2 \
# model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\
# model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \
# trainer.accelerator=gpu \
# exp_manager=null && \
# rm -rf sgd_gen_outputs
L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \
model.dataset.task_name=debug_sample \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.dataset.num_tasks=6 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=bert-base-cased \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf sgd_gen_bert_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
model.dataset.data_dir=/home/TestData/nlp/processed_assistant \
model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \
model.dataset.task=assistant \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=bert-base-uncased \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf sgd_gen_bert_intent_classification_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \
model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \
model.dataset.task=zero_shot \
model.dataset.prompt_template="This example is" \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=bert-base-uncased \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf sgd_gen_zero_shot_intent_classification_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \
model.dataset.task=design \
model.dataset.prompt_template="This example is related to" \
model.library=megatron \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=bert-base-uncased \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf design_zero_shot_intent_classification_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \
model.dataset.task=design \
model.dataset.prompt_template="This example is related to" \
model.library=huggingface \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=bert-base-uncased \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf design_zero_shot_intent_classification_bart_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \
model.dataset.task=design \
model.dataset.prompt_template="" \
model.library=huggingface \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf design_dialogue_nearest_neighbour_classification_outputs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Dialogue Generation
L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
model.dataset.dialogues_example_dir=answer_extender_s2s \
model.dataset.task=ms_marco \
model.library=huggingface \
model.dataset.debug_mode=True \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=facebook/bart-large \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf answer_extender_s2s
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \
model.dataset.task_name=debug_sample \
model.dataset.task=sgd_generation \
model.dataset.input_field=utterance+system_actions \
model.dataset.output_field=system_utterance \
model.dataset.use_cache=false \
model.dataset.system_utterance=next_turn \
model.dataset.debug_mode=True \
model.dataset.prompt_template=slots_values \
model.library=huggingface \
trainer.max_steps=1 \
trainer.max_epochs=1 \
model.train_ds.batch_size=2 \
model.validation_ds.batch_size=2 \
model.test_ds.batch_size=2 \
model.nemo_path=null \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.language_model.pretrained_model_name=facebook/bart-large \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf sgd_answer_extender_s2s
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# - name: L2: Dialogue Generation Part 2
# when {
# anyOf {
# branch main
# changeRequest target: main
# }
# }
# failFast true
# parallel {
# - name: Dialogue: Answer Extender using DialogueGPTGenerationModel
# - run: |
# cd examples/nlp/dialogue && \
# python dialogue.py \
# do_training=False \
# model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
# model.dataset.dialogues_example_dir=answer_extender \
# model.library=huggingface \
# model.dataset.task=ms_marco \
# model.dataset.debug_mode=True \
# trainer.val_check_interval=0.0 \
# trainer.devices=1 \
# model.dataset.use_cache=false \
# model.language_model.pretrained_model_name=gpt2 \
# trainer.accelerator=gpu \
# exp_manager=null && \
# rm -rf answer_extender
# }
# }
# }
# }
# L2: COPY
L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
model.dataset.dialogues_example_dir=answer_extender \
model.library=huggingface \
model.dataset.task=ms_marco \
model.dataset.debug_mode=True \
trainer.val_check_interval=0.0 \
trainer.devices=1 \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name=gpt2 \
trainer.accelerator=gpu \
exp_manager=null && \
rm -rf answer_extender
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Duplex Text Normalization
L2_Duplex_Text_Normalization_with_Tarred_dataset:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/duplex_text_normalization && \
python duplex_text_normalization_train.py \
data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \
mode=tn \
lang=en \
tagger_model.do_training=false \
decoder_model.transformer=t5-small \
data.validation_ds.batch_size=2 \
data.train_ds.use_cache=false \
data.validation_ds.use_cache=false \
data.test_ds.batch_size=2 \
data.train_ds.decoder_data_augmentation=false \
data.train_ds.num_workers=2 \
decoder_trainer.devices=[0,1] \
decoder_trainer.accelerator="gpu" \
data.train_ds.use_tarred_dataset=true \
+decoder_trainer.fast_dev_run=true \
decoder_exp_manager.create_checkpoint_callback=false \
data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \
data.test_ds.use_cache=false \
data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# Runs out of memory on the 12G TITAN V (GPU 0 on main CI)
# TODO: add when megatron bert is supported again in NeMo
# - name: L2: MegaBERT Token Classification
# when {
# anyOf {
# branch main
# changeRequest target: main
# }
# }
# failFast true
# - run: |
# cd examples/nlp/token_classification && \
# python token_classification_train.py \
# model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \
# model.language_model.pretrained_model_name=megatron-bert-345m-uncased \
# model.train_ds.batch_size=10 \
# model.dataset.max_seq_length=50 \
# model.dataset.use_cache=false \
# trainer.accelerator=gpu \
# trainer.strategy=ddp \
# trainer.precision=16 \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# exp_manager=null
# }
# }
# L2: BERT Text Classification
L2_BERT_Text_Classification_with_BERT_Test:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/text_classification && \
python text_classification_with_bert.py \
model.dataset.num_classes=6 \
model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \
model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \
model.language_model.pretrained_model_name=distilbert-base-uncased \
model.train_ds.batch_size=10 \
model.dataset.max_seq_length=50 \
model.dataset.use_cache=false \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0
L2_Parallel_BERT_Question-Answering_SQUAD_v1_1:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
# Cannot do fast_dev_run because squad needs whole dev dataset
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
model.test_ds.num_samples=2 \
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.language_model.pretrained_model_name=bert-base-uncased \
model.dataset.version_2_with_negative=false \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_BERT_Question-Answering_SQUAD_v2_0:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
# Cannot do fast_dev_run because squad needs whole dev dataset
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=bert-base-uncased \
model.dataset.version_2_with_negative=true \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0
L2_Parallel_BART_Question-Answering_SQUAD_v1_1:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
model.test_ds.num_samples=2 \
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=false \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_BART_Question-Answering_SQUAD_v2_0:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=true \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0
L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
model.test_ds.num_samples=2 \
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.language_model.pretrained_model_name=gpt2 \
model.dataset.version_2_with_negative=false \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=gpt2 \
model.dataset.version_2_with_negative=true \
trainer.precision=16 \
trainer.devices=1 \
trainer.accelerator="gpu" \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Intent and Slot Classification Tasks
L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/intent_slot_classification && \
python intent_slot_classification.py \
model.data_dir=/home/TestData/nlp/retail \
model.validation_ds.prefix=dev \
model.test_ds.prefix=dev \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
exp_manager.exp_dir=checkpoints
rm -rf checkpoints
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/intent_slot_classification && \
python multi_label_intent_slot_classification.py \
model.data_dir=/home/TestData/nlp/new_multiatis \
model.validation_ds.prefix=dev \
model.test_ds.prefix=dev \
trainer.devices=1 \
+trainer.fast_dev_run=true \
exp_manager.exp_dir=checkpoints2
rm -rf checkpoints2
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# TODO: add when megatron-bert is supported again
# stage('L2: Model Parallel Size 2 Megatron Text Classification') {
# when {
# anyOf{
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# steps{
# cd examples/nlp/text_classification && \
# python text_classification_with_bert.py \
# trainer.devices=[0,1] \
# trainer.accelerator="gpu" \
# trainer.num_nodes=1 \
# trainer.precision=16 \
# trainer.gradient_clip_val=1.0 \
# +trainer.fast_dev_run=true \
# model.dataset.num_classes=6 \
# model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \
# model.train_ds.batch_size=4 \
# model.language_model.pretrained_model_name=megatron-bert-uncased \
# model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \
# model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
# model.nemo_path=null \
# ~model.infer_samples \
# exp_manager=null
# }
# }
# stage('L2: Model Parallel Size 2 Megatron Autoresume') {
# when {
# anyOf{
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# steps{
# cd examples/nlp/text_classification && \
# python text_classification_with_bert.py \
# trainer.devices=[0,1] \
# trainer.accelerator="gpu" \
# trainer.num_nodes=1 \
# trainer.precision=16 \
# trainer.gradient_clip_val=1.0 \
# trainer.max_epochs=1 \
# +trainer.fast_dev_run=true \
# model.dataset.num_classes=6 \
# model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \
# model.train_ds.batch_size=4 \
# model.language_model.pretrained_model_name=megatron-bert-uncased \
# model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \
# model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
# model.nemo_path=null \
# ~model.infer_samples \
# +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \
# +exp_manager.resume_if_exists=true
# }
# }
# stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') {
# when {
# anyOf{
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# steps{
# cd examples/nlp/text_classification && \
# python model_parallel_text_classification_evaluation.py \
# trainer.devices=[0,1] \
# trainer.accelerator="gpu" \
# trainer.num_nodes=1 \
# model.dataset.num_classes=6 \
# model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \
# model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \
# exp_manager=null
# }
# }
# stage('L2: Model Parallel Size 2 Megatron Train from .nemo') {
# when {
# anyOf{
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# steps{
# cd examples/nlp/token_classification && \
# python token_classification_train.py \
# pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \
# model.dataset.data_dir=/home/TestData/nlp/ner/ \
# model.train_ds.batch_size=2 \
# model.dataset.use_cache=false \
# trainer.devices=[0,1] \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# model.dataset.class_balancing="weighted_loss" \
# exp_manager=null
# }
# }
# L2: Parallel NLP Examples 2
L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/token_classification && \
python token_classification_train.py \
pretrained_model=ner_en_bert \
model.dataset.data_dir=/home/TestData/nlp/ner/ \
model.train_ds.batch_size=2 \
model.dataset.use_cache=false \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
model.dataset.class_balancing="weighted_loss" \
exp_manager.exp_dir=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/token_classification && \
data_dir="$(mktemp -d -p "$(pwd)")" && \
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \
python punctuation_capitalization_train_evaluate.py \
pretrained_model=punctuation_en_bert \
model.train_ds.ds_item="${data_dir}" \
model.validation_ds.ds_item="${data_dir}" \
model.test_ds.ds_item="${data_dir}" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
exp_manager.exp_dir=null && \
rm -rf "${data_dir}"
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/token_classification && \
python token_classification_train.py \
model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
model.dataset.use_cache=false \
model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \
exp_manager.exp_dir=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/token_classification/token_classification_evaluate.py \
model.dataset.data_dir=/home/TestData/nlp/ner/ \
model.dataset.use_cache=false \
pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
data_dir="$(mktemp -d -p "$(pwd)")" && \
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \
python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
model.test_ds.ds_item="${data_dir}" \
~model.train_ds \
~model.validation_ds \
+model.test_ds.use_cache=false \
pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \
rm -rf "${data_dir}"
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/token_classification && \
output_dir="$(mktemp -d -p "$(pwd)")" && \
tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \
python punctuation_capitalization_train_evaluate.py \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item="${tmp_data_dir}" \
model.validation_ds.ds_item="${tmp_data_dir}" \
model.test_ds.ds_item="${tmp_data_dir}" \
model.language_model.pretrained_model_name=distilbert-base-uncased \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.accelerator="gpu" \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir="${output_dir}" \
+do_testing=true && \
tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \
mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \
rm -rf "${tmp_data_dir}" && \
python punctuation_capitalization_train_evaluate.py \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item="${tmp_data_dir_2}" \
model.validation_ds.ds_item="${tmp_data_dir_2}" \
model.test_ds.ds_item="${tmp_data_dir_2}" \
pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.accelerator="gpu" \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \
"${tmp_data_dir_2}" \
"${output_dir}"
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# Punctuation & Capitalization tarred dataset:
Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
data_dir="$(mktemp -d -p "$(pwd)")" && \
cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \
/home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \
"${data_dir}"/ && \
usual_data=${data_dir}/wmt_wiki_10000 && \
output_dir="$(mktemp -d -p "$(pwd)")" && \
tarred_data=${output_dir}/train_tarred && \
tokens_in_batch=2000 && \
max_seq_length=512 && \
lm_model=distilbert-base-uncased && \
python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \
--text ${usual_data}/input.txt \
--labels ${usual_data}/labels.txt \
--output_dir ${tarred_data} \
--tokens_in_batch ${tokens_in_batch} \
--max_seq_length 512 \
--lines_per_dataset_fragment 2000 \
--num_batches_per_tarfile 5 \
--tar_file_prefix punctuation_capitalization \
--tokenizer_name ${lm_model} \
--use_fast_tokenizer \
--pad_label O \
--n_jobs 3 && \
echo "Number of tarred files in dataset:" && \
ls ${tarred_data}/*.tar | wc -l && \
echo "Label id files in dataset:" && \
ls ${tarred_data}/*.csv && \
metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \
python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \
model.validation_ds.ds_item="${data_dir}" \
model.test_ds.ds_item="${data_dir}" \
model.train_ds.ds_item=${tarred_data} \
model.language_model.pretrained_model_name=${lm_model} \
model.train_ds.use_tarred_dataset=true \
model.train_ds.tar_metadata_file=${metadata_file} \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.accelerator="gpu" \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir=${output_dir}/output && \
rm -rf "${output_dir}" "${data_dir}"
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# Punctuation_Capitalization_Different_ways_of_passing_labels_to_model
Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/token_classification && \
work_dir="$(mktemp -d -p "$(pwd)")" && \
label_vocab_dir="${work_dir}/labels" && \
mkdir -p ${label_vocab_dir} && \
data_dir="${work_dir}/data" && \
mkdir -p "${data_dir}" && \
cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \
output_dir="${work_dir}/output" && \
mkdir -p "${output_dir}" && \
punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \
capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \
printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \
printf "O\nU\n" > "${capit_label_vocab}" && \
python punctuation_capitalization_train_evaluate.py \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item="${data_dir}" \
model.validation_ds.ds_item="${data_dir}" \
model.test_ds.ds_item="${data_dir}" \
model.language_model.pretrained_model_name=distilbert-base-uncased \
model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \
model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \
model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir="${output_dir}" \
+do_testing=false && \
python punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
~model.train_ds \
~model.validation_ds \
model.test_ds.ds_item="${data_dir}" \
pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -rf "${work_dir}"
# TODO: pleasefixme
# Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# cd examples/nlp/token_classification && \
# work_dir="$(mktemp -d -p "$(pwd)")" && \
# output_dir="${work_dir}/output" && \
# mkdir -p "${output_dir}" && \
# data_dir="${work_dir}/data" && \
# mkdir -p "${data_dir}" && \
# cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \
# conf_name=punctuation_capitalization_config_with_ids && \
# cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \
# sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \
# "${work_dir}/${conf_name}.yaml" && \
# sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \
# "${work_dir}/${conf_name}.yaml" && \
# python punctuation_capitalization_train_evaluate.py \
# --config-path "${work_dir}" \
# --config-name "${conf_name}" \
# model.train_ds.use_tarred_dataset=false \
# model.train_ds.ds_item="${data_dir}" \
# model.validation_ds.ds_item="${data_dir}" \
# model.test_ds.ds_item="${data_dir}" \
# model.language_model.pretrained_model_name=distilbert-base-uncased \
# +model.train_ds.use_cache=false \
# +model.validation_ds.use_cache=false \
# +model.test_ds.use_cache=false \
# trainer.devices=[0,1] \
# trainer.strategy=ddp \
# trainer.max_epochs=1 \
# +exp_manager.explicit_log_dir="${output_dir}" \
# +do_testing=false && \
# python punctuation_capitalization_train_evaluate.py \
# +do_training=false \
# +do_testing=true \
# ~model.train_ds \
# ~model.validation_ds \
# model.test_ds.ds_item="${data_dir}" \
# pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \
# +model.train_ds.use_cache=false \
# +model.validation_ds.use_cache=false \
# +model.test_ds.use_cache=false \
# trainer.devices=[0,1] \
# trainer.strategy=ddp \
# trainer.max_epochs=1 \
# exp_manager=null && \
# rm -rf "${work_dir}"
# Punctuation & Capitalization inference
Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
output_dir="$(mktemp -d -p "$(pwd)")" && \
python examples/nlp/token_classification/punctuate_capitalize_infer.py \
--input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \
--output_text "${output_dir}/iwslt_inference_result.txt" \
--max_seq_length 92 \
--step 8 \
--margin 16 \
--pretrained_name punctuation_en_bert \
--batch_size 32 && \
rm -rf "${output_dir}"
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Parallel Pretraining BERT pretraining from Text/Preprocessed
L2_Pretraining_BERT_pretraining_from_Text:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/language_modeling && \
python bert_pretraining.py \
--config-name=bert_pretraining_from_text_config.yaml \
trainer.devices=1 \
trainer.accelerator="gpu" \
trainer.precision=16 \
+trainer.fast_dev_run=true \
model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \
model.train_ds.batch_size=32 \
model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \
model.validation_ds.batch_size=32 \
model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \
model.optim.lr=0.01 \
model.optim.sched.warmup_ratio=0.1 \
model.tokenizer.tokenizer_name=sentencepiece \
model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \
model.mask_prob=0.15 \
model.short_seq_prob=0.1 \
exp_manager.exp_dir=PretrainingBERTFromText \
rm -f /home/TestData/nlp/wikitext-2/*.pkl
#rm -rf examples/nlp/language_modeling/PretrainingBERTFromText
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Pretraining_BERT_from_Preprocessed:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/language_modeling && \
python bert_pretraining.py \
--config-name=bert_pretraining_from_preprocessed_config.yaml \
trainer.devices=1 \
trainer.accelerator="gpu" \
trainer.precision=16 \
+trainer.fast_dev_run=false \
+trainer.max_epochs=1 \
+trainer.limit_val_batches=0 \
+trainer.limit_train_batches=1 \
model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \
model.train_ds.batch_size=8 \
model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \
model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \
model.optim.lr=0.875e-4 \
model.optim.weight_decay=0.01 \
model.optim.sched.warmup_ratio=0.01 \
exp_manager.exp_dir=PretrainingBERTFromPreprocessed \
exp_manager.create_checkpoint_callback=False \
#rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Entity Linking
L2_Entity_Linking_Self_Alignment_Pretraining_BERT:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/entity_linking && \
python self_alignment_pretraining.py \
project_dir=. \
trainer.val_check_interval=3 \
model.raw_data=None \
model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \
model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \
model.train_ds.batch_size=8 \
model.validation_ds.batch_size=8 \
exp_manager.exp_dir=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858
# is in the release container
# L2: NMT Attention is All You Need Training
L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/machine_translation/enc_dec_nmt.py \
--config-path=conf \
--config-name=aayn_base \
do_testing=false \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.encoder.num_layers=1 \
model.encoder.hidden_size=64 \
model.encoder.inner_size=256 \
model.decoder.num_layers=1 \
model.decoder.hidden_size=64 \
model.decoder.inner_size=256 \
+model.optim.capturable=True \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.val_check_interval=2 \
+trainer.limit_val_batches=1 \
+trainer.max_steps=2 \
trainer.precision=16 \
+exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \
+exp_manager.create_checkpoint_callback=true
python examples/nlp/machine_translation/enc_dec_nmt.py \
--config-path=conf \
--config-name=aayn_base \
do_testing=true \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.encoder.num_layers=1 \
model.encoder.hidden_size=64 \
model.encoder.inner_size=256 \
model.decoder.num_layers=1 \
model.decoder.hidden_size=64 \
model.decoder.inner_size=256 \
+model.optim.capturable=True \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.val_check_interval=10 \
+trainer.limit_val_batches=1 \
+trainer.limit_test_batches=1 \
+trainer.max_steps=10 \
+exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \
+exp_manager.create_checkpoint_callback=true \
+exp_manager.resume_if_exists=True
rm -rf examples/nlp/machine_translation/nmt_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python enc_dec_nmt.py \
--config-path=conf \
--config-name=aayn_base \
do_testing=true \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.encoder.pre_ln=true \
model.decoder.pre_ln=true \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
+trainer.limit_test_batches=2 \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python enc_dec_nmt.py \
--config-path=conf \
--config-name=aayn_base \
do_testing=true \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \
model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \
model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \
model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \
model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \
model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
+trainer.limit_test_batches=2 \
exp_manager=null
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: NMT Attention is All You Need Inference
L2_NMT_Attention_is_All_You_Need_Inference:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python nmt_transformer_infer.py \
--model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \
--srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \
--tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \
--target_lang en \
--source_lang de
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: NMT Attention is All You Need Finetuning
L2_NMT_Attention_is_All_You_Need_Finetuning:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python enc_dec_nmt_finetune.py \
model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \
trainer.devices=1 \
~trainer.max_epochs \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+trainer.val_check_interval=10 \
+trainer.limit_val_batches=1 \
+trainer.limit_test_batches=1 \
+trainer.max_steps=10 \
+exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \
+exp_manager.create_checkpoint_callback=True \
+exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \
+exp_manager.checkpoint_callback_params.mode=max \
+exp_manager.checkpoint_callback_params.save_best_model=true
rm -rf examples/nlp/machine_translation/nmt_finetune
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: NMT Tarred Dataset Creation
L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python enc_dec_nmt.py \
--config-path=conf \
--config-name=aayn_base \
do_training=false \
model.preproc_out_dir=$PWD/preproc_out_dir \
model.train_ds.use_tarred_dataset=true \
model.train_ds.n_preproc_jobs=2 \
model.train_ds.lines_per_dataset_fragment=500 \
model.train_ds.num_batches_per_tarfile=10 \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.encoder_tokenizer.vocab_size=2000 \
model.decoder_tokenizer.vocab_size=2000 \
~model.test_ds \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=true \
exp_manager=null \
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
cd examples/nlp/machine_translation && \
python create_tarred_parallel_dataset.py \
--src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
--tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
--out_dir $PWD/out_dir \
--encoder_tokenizer_vocab_size=2000 \
--decoder_tokenizer_vocab_size=2000 \
--tokens_in_batch=1000 \
--lines_per_dataset_fragment=500 \
--num_batches_per_tarfile=10 \
--n_preproc_jobs=2 \
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_NMT_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/machine_translation/megatron_nmt_training.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
+trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation='swiglu' \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.train_ds.num_workers=1 \
model.validation_ds.num_workers=1 \
~model.test_ds \
model.train_ds.dataset_type=text_memmap \
model.encoder_tokenizer.library=sentencepiece \
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
model.decoder_tokenizer.library=sentencepiece \
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
# Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
# if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
python examples/nlp/machine_translation/megatron_nmt_training.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation='swiglu' \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.train_ds.num_workers=1 \
model.validation_ds.num_workers=1 \
~model.test_ds \
model.train_ds.dataset_type=text_memmap \
model.encoder_tokenizer.library=sentencepiece \
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
model.decoder_tokenizer.library=sentencepiece \
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
rm -rf examples/nlp/machine_translation/megatron_nmt_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_BART_Perceiver_MIM_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.arch=perceiver \
model.encoder.num_attention_heads=8 \
model.encoder.activation='swiglu' \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.data.data_impl=text_mmap \
model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
model.data.splits_string='"800,100,100"' \
model.data.whole_word_masking=False \
model.tokenizer.library=sentencepiece \
model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
++model.hiddens.enc_output_name=z \
++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
++model.hiddens.transform.q_z_given_x.hidden_size=64 \
++model.hiddens.loss.mim.cls_name=a_mim \
++model.hiddens.loss.mim.loss_weight=0.5
# Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
# if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.arch=perceiver \
model.encoder.num_attention_heads=8 \
model.encoder.activation='swiglu' \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
model.global_batch_size=4 \
model.data.data_impl=text_mmap \
model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
model.data.splits_string='"800,100,100"' \
model.data.whole_word_masking=False \
model.tokenizer.library=sentencepiece \
model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
++model.hiddens.enc_output_name=z \
++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
++model.hiddens.transform.q_z_given_x.hidden_size=64 \
++model.hiddens.loss.mim.cls_name=a_mim \
++model.hiddens.loss.mim.loss_weight=0.5
rm -rf examples/nlp/language_modeling/megatron_mim_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# stage('L2: NMT Bottleneck Fallback') {
# when {
# anyOf {
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# parallel {
# stage('L2: seq2seq (no bottleneck)') {
# steps {
# cd examples/nlp/machine_translation && \
# enc_dec_nmt-bottleneck.py \
# --config-path=conf \
# --config-name=aayn_bottleneck \
# do_testing=true \
# model.model_type=nll \
# model.encoder.arch=seq2seq \
# model.encoder.hidden_steps=1 \
# model.encoder.hidden_blocks=1 \
# model.encoder.hidden_init_method=params \
# model.encoder.hidden_size=64 \
# model.encoder.inner_size=128 \
# model.encoder.num_attention_heads=2 \
# model.encoder.num_layers=2 \
# model.decoder.hidden_size=64 \
# model.decoder.inner_size=128 \
# model.decoder.num_attention_heads=2 \
# model.decoder.num_layers=2 \
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \
# model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \
# model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# +trainer.limit_test_batches=2 \
# exp_manager=null \
# }
# }
# }
# }
# stage('L2: NMT Bottleneck Architecture') {
# when {
# anyOf {
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# parallel {
# stage('Bridge Encoder (identity)') {
# steps {
# cd examples/nlp/machine_translation && \
# enc_dec_nmt-bottleneck.py \
# --config-path=conf \
# --config-name=aayn_bottleneck \
# do_testing=true \
# model.model_type=nll \
# model.encoder.arch=bridge \
# model.encoder.hidden_steps=1 \
# model.encoder.hidden_blocks=1 \
# model.encoder.hidden_init_method=identity \
# model.encoder.hidden_size=64 \
# model.encoder.inner_size=128 \
# model.encoder.num_attention_heads=2 \
# model.encoder.num_layers=2 \
# model.decoder.hidden_size=64 \
# model.decoder.inner_size=128 \
# model.decoder.num_attention_heads=2 \
# model.decoder.num_layers=2 \
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# +trainer.limit_test_batches=2 \
# exp_manager=null
# }
# }
# stage('Perceiver Encoder (params)') {
# steps {
# cd examples/nlp/machine_translation && \
# enc_dec_nmt-bottleneck.py \
# --config-path=conf \
# --config-name=aayn_bottleneck \
# do_testing=true \
# model.model_type=nll \
# model.encoder.arch=perceiver \
# model.encoder.hidden_steps=1 \
# model.encoder.hidden_blocks=1 \
# model.encoder.hidden_init_method=params \
# model.encoder.hidden_size=64 \
# model.encoder.inner_size=128 \
# model.encoder.num_attention_heads=2 \
# model.encoder.num_layers=2 \
# model.decoder.hidden_size=64 \
# model.decoder.inner_size=128 \
# model.decoder.num_attention_heads=2 \
# model.decoder.num_layers=2 \
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# +trainer.limit_test_batches=2 \
# exp_manager=null
# }
# }
# }
# }
# stage('L2: NMT Bottleneck LVM') {
# when {
# anyOf {
# branch 'main'
# changeRequest target: 'main'
# }
# }
# failFast true
# parallel {
# stage('VAE') {
# steps {
# cd examples/nlp/machine_translation && \
# enc_dec_nmt-bottleneck.py \
# --config-path=conf \
# --config-name=aayn_bottleneck \
# do_testing=true \
# model.model_type=vae \
# model.encoder.arch=perceiver \
# model.encoder.hidden_steps=1 \
# model.encoder.hidden_blocks=1 \
# model.encoder.hidden_init_method=params \
# model.encoder.hidden_size=64 \
# model.encoder.inner_size=128 \
# model.encoder.num_attention_heads=2 \
# model.encoder.num_layers=2 \
# model.decoder.hidden_size=64 \
# model.decoder.inner_size=128 \
# model.decoder.num_attention_heads=2 \
# model.decoder.num_layers=2 \
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# +trainer.limit_test_batches=2 \
# exp_manager=null
# }
# }
# stage('MIM') {
# steps {
# cd examples/nlp/machine_translation && \
# enc_dec_nmt-bottleneck.py \
# --config-path=conf \
# --config-name=aayn_bottleneck \
# do_testing=true \
# model.model_type=mim \
# model.encoder.arch=perceiver \
# model.encoder.hidden_steps=1 \
# model.encoder.hidden_blocks=1 \
# model.encoder.hidden_init_method=params \
# model.encoder.hidden_size=64 \
# model.encoder.inner_size=128 \
# model.encoder.num_attention_heads=2 \
# model.encoder.num_layers=2 \
# model.decoder.hidden_size=64 \
# model.decoder.inner_size=128 \
# model.decoder.num_attention_heads=2 \
# model.decoder.num_layers=2 \
# model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
# model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
# model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=true \
# +trainer.limit_test_batches=2 \
# exp_manager=null
# }
# }
# }
# }
L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
rm -rf examples/nlp/language_modeling/bert_pretrain_results
rm -rf examples/nlp/language_modeling/bert_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_Bert_Pretraining_and_Resume_Training:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.sequence_parallel=True \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
rm -rf examples/nlp/language_modeling/bert_pretrain_results
rm -rf examples/nlp/language_modeling/bert_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_Core_Bert_Pretraining_and_Resume_Training:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=32 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
model.mcore_bert=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.sequence_parallel=True \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=20 \
trainer.precision=32 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
exp_manager.resume_if_exists=True \
model.mcore_bert=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
rm -rf examples/nlp/language_modeling/bert_pretrain_results
rm -rf examples/nlp/language_modeling/bert_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_RETRO_Pretraining_and_Resume_Training:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_retro_pretraining.py \
trainer.num_nodes=1 \
trainer.devices=2 \
trainer.precision=bf16 \
trainer.accelerator=gpu \
model.data.data_prefix=['none'] \
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
model.mcore_gpt=True \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
model.data.num_workers=4 \
model.micro_batch_size=1 \
model.data.shuffle_documents=False \
trainer.val_check_interval=30 \
+trainer.num_sanity_val_steps=0 \
model.init_method_std=0.023 \
model.optim.lr=6.0e-4 \
model.megatron_amp_O2=True \
model.data.splits_string=\'\"98,2,0\"\' \
model.data.dataloader_type=cyclic \
trainer.max_steps=10
python examples/nlp/language_modeling/megatron_retro_pretraining.py \
trainer.num_nodes=1 \
trainer.devices=2 \
trainer.precision=bf16 \
trainer.accelerator=gpu \
model.data.data_prefix=['none'] \
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
model.mcore_gpt=True \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
model.data.num_workers=4 \
model.micro_batch_size=1 \
model.data.shuffle_documents=False \
trainer.val_check_interval=30 \
+trainer.num_sanity_val_steps=0 \
model.init_method_std=0.023 \
model.optim.lr=6.0e-4 \
model.megatron_amp_O2=True \
model.data.splits_string=\'\"98,2,0\"\' \
model.data.dataloader_type=cyclic \
trainer.max_steps=20
rm -rf examples/nlp/language_modeling/mcore_retro_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.accelerator=gpu \
trainer.accumulate_grad_batches=1 \
trainer.limit_val_batches=2 \
exp_manager.resume_if_exists=True \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
trainer.val_check_interval=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
model.data.data_prefix= \
model.data.knn_index= \
model.data.retrieval_prefix= \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.chunk_size=32 \
model.enc_num_layers=2 \
model.dec_num_layers=2 \
model.enc_cross_attention=[1] \
model.dec_cross_attention=[1] \
+model.data.mock=True
python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.accelerator=gpu \
trainer.accumulate_grad_batches=1 \
trainer.limit_val_batches=2 \
exp_manager.resume_if_exists=True \
trainer.max_steps=20 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
trainer.val_check_interval=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
model.data.data_prefix= \
model.data.knn_index= \
model.data.retrieval_prefix= \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.chunk_size=32 \
model.enc_num_layers=2 \
model.dec_num_layers=2 \
model.enc_cross_attention=[1] \
model.dec_cross_attention=[1] \
+model.data.mock=True
rm -rf examples/nlp/language_modeling/retro_legacy_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2_Megatron_RETRO_muTransfer_Pretraining_Performance:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \
# trainer.devices=2 \
# trainer.num_nodes=1 \
# trainer.accelerator=gpu \
# trainer.accumulate_grad_batches=1 \
# trainer.max_steps=100 \
# trainer.log_every_n_steps=1 \
# trainer.precision=16 \
# trainer.val_check_interval=100 \
# trainer.limit_val_batches=0 \
# trainer.gradient_clip_val=1.0 \
# +trainer.num_sanity_val_steps=0 \
# exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \
# +exp_manager.version=smalltest \
# model.data.neighbors=2 \
# model.megatron_amp_O2=False \
# model.apply_query_key_layer_scaling=False \
# model.tensor_model_parallel_size=1 \
# model.optim.name=muadamw \
# model.optim.weight_decay=0.1 \
# model.optim.betas=[0.9,0.95] \
# model.optim.lr=6e-4 \
# model.optim.sched.warmup_steps=1000 \
# model.optim.sched.constant_steps=0 \
# model.optim.sched.min_lr=6e-5 \
# model.add_position_embedding=False \
# model.enc_num_layers=2 \
# model.dec_num_layers=6 \
# model.enc_cross_attention=[0] \
# model.dec_cross_attention=[3,5] \
# model.hidden_size=96 \
# model.ffn_hidden_size=384 \
# model.init_method_std=0.023 \
# model.num_attention_heads=12 \
# model.max_position_embeddings=1024 \
# model.encoder_seq_length=1024 \
# model.tokenizer.library=megatron \
# model.tokenizer.type=GPT2BPETokenizer \
# model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \
# model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \
# model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \
# model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \
# model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \
# model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \
# model.data.num_workers=8 \
# model.micro_batch_size=8 \
# model.normalization=rmsnorm \
# model.transformer_block_type=pre_ln \
# model.bias_activation_fusion=True \
# model.bias_dropout_add_fusion=False \
# model.masked_softmax_fusion=True \
# model.hidden_dropout=0 \
# model.attention_dropout=0 \
# model.fp32_residual_connection=True \
# model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml
# python -c "import pandas as pd
# import pathlib
# from pandas.testing import assert_frame_equal
# from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# import torch
# if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()):
# import sys
# sys.exit(0)
# event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0]
# ea = EventAccumulator(str(event_file)).Reload()
# vals = []
# for i in ea.Scalars('reduced_train_loss'):
# vals.append(i.value)
# training_curve = pd.DataFrame({'loss': vals})
# gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv')
# assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"
# rm -rf examples/nlp/language_modeling/retro_results
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
L2_BioMegatron_Bert_NER_Task:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/token_classification/token_classification_train.py \
exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \
trainer.max_epochs=1 \
model.dataset.data_dir=/home/TestData/nlp/ner \
model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \
model.tokenizer.tokenizer_name=null
rm -rf examples/nlp/language_modeling/token_classification_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.position_embedding_type=rope \
model.rotary_percentage=0.5 \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
# commented out to save time on github ci @adithyare
# python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
# trainer.devices=2 \
# trainer.accelerator=gpu \
# trainer.log_every_n_steps=1 \
# trainer.val_check_interval=2 \
# trainer.limit_val_batches=1 \
# trainer.accumulate_grad_batches=1 \
# trainer.max_steps=6 \
# trainer.precision=16 \
# trainer.gradient_clip_val=1.0 \
# exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
# exp_manager.resume_if_exists=True \
# model.tensor_model_parallel_size=2 \
# model.optim.name=fused_adam \
# model.optim.lr=2e-4 \
# model.optim.sched.warmup_steps=2 \
# model.optim.sched.constant_steps=2 \
# model.optim.sched.min_lr=8e-5 \
# model.max_position_embeddings=128 \
# model.encoder_seq_length=128 \
# model.data.seq_length=128 \
# model.position_embedding_type=rope \
# model.rotary_percentage=0.5 \
# model.normalization=rmsnorm \
# model.bias=False \
# model.bias_activation_fusion=False \
# model.bias_dropout_add_fusion=False \
# model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
# model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
# model.num_layers=8 \
# model.hidden_size=256 \
# model.num_attention_heads=8 \
# model.activations_checkpoint_method=block \
# model.activations_checkpoint_granularity=full \
# model.activations_checkpoint_num_layers=1 \
# model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
# model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# This test requires Ampere but some of the test GPUs are Volta
# Need to add a check for compute capability before uncommenting this test
# - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2
# when {
# anyOf {
# branch main
# changeRequest target: main
# }
# }
# failFast true
# - run: |
# python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
# trainer.devices=2 \
# trainer.accelerator=gpu \
# trainer.log_every_n_steps=1 \
# trainer.val_check_interval=2 \
# trainer.limit_val_batches=2 \
# trainer.accumulate_grad_batches=1 \
# trainer.max_steps=3 \
# trainer.precision=16 \
# trainer.gradient_clip_val=1.0 \
# exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
# model.tensor_model_parallel_size=2 \
# model.optim.name=fused_adam \
# model.optim.lr=2e-4 \
# model.optim.sched.warmup_steps=1 \
# model.optim.sched.constant_steps=1 \
# model.optim.sched.min_lr=8e-5 \
# model.max_position_embeddings=128 \
# model.encoder_seq_length=128 \
# model.data.seq_length=128 \
# model.position_embedding_type=rope \
# model.rotary_percentage=0.5 \
# model.normalization=rmsnorm \
# model.bias=False \
# model.bias_activation_fusion=False \
# model.bias_dropout_add_fusion=False \
# model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
# model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
# model.num_layers=8 \
# model.hidden_size=256 \
# model.num_attention_heads=8 \
# model.activations_checkpoint_method=block \
# model.activations_checkpoint_granularity=full \
# model.activations_checkpoint_num_layers=1 \
# model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
# model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \
# model.use_flash_attention=True "
# # commented out to save time on github ci @adithyare
# # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
# # trainer.devices=2 \
# # trainer.accelerator=gpu \
# # trainer.log_every_n_steps=1 \
# # trainer.val_check_interval=2 \
# # trainer.limit_val_batches=1 \
# # trainer.accumulate_grad_batches=1 \
# # trainer.max_steps=6 \
# # trainer.precision=16 \
# # trainer.gradient_clip_val=1.0 \
# # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
# # exp_manager.resume_if_exists=True \
# # model.tensor_model_parallel_size=2 \
# # model.optim.name=fused_adam \
# # model.optim.lr=2e-4 \
# # model.optim.sched.warmup_steps=2 \
# # model.optim.sched.constant_steps=2 \
# # model.optim.sched.min_lr=8e-5 \
# # model.max_position_embeddings=128 \
# # model.encoder_seq_length=128 \
# # model.data.seq_length=128 \
# # model.position_embedding_type=rope \
# # model.rotary_percentage=0.5 \
# # model.normalization=rmsnorm \
# # model.bias=False \
# # model.bias_activation_fusion=False \
# # model.bias_dropout_add_fusion=False \
# # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
# # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
# # model.num_layers=8 \
# # model.hidden_size=256 \
# # model.num_attention_heads=8 \
# # model.activations_checkpoint_method=block \
# # model.activations_checkpoint_granularity=full \
# # model.activations_checkpoint_num_layers=1 \
# # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
# # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \
# # model.use_flash_attention=True"
# rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
# rm -rf examples/nlp/language_modeling/gpt_index_mappings"
# }
# }
L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.position_embedding_type=alibi \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
# not testing resume functionality to save time on ci @adithyare
#python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
#trainer.devices=2 \
#trainer.accelerator=gpu \
#trainer.log_every_n_steps=1 \
#trainer.val_check_interval=2 \
#trainer.limit_val_batches=1 \
#trainer.accumulate_grad_batches=1 \
#trainer.max_steps=6 \
#trainer.precision=16 \
#trainer.gradient_clip_val=1.0 \
#exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
#exp_manager.resume_if_exists=True \
#model.tensor_model_parallel_size=2 \
#model.optim.name=fused_adam \
#model.optim.lr=2e-4 \
#model.optim.sched.warmup_steps=2 \
#model.optim.sched.constant_steps=2 \
#model.optim.sched.min_lr=8e-5 \
#model.max_position_embeddings=128 \
#model.encoder_seq_length=128 \
#model.data.seq_length=128 \
#model.position_embedding_type=alibi \
#model.normalization=rmsnorm \
#model.bias=False \
#model.bias_activation_fusion=False \
#model.bias_dropout_add_fusion=False \
#model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
#model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
#model.num_layers=8 \
#model.hidden_size=256 \
#model.num_attention_heads=8 \
#model.activations_checkpoint_method=block \
#model.activations_checkpoint_granularity=full \
#model.activations_checkpoint_num_layers=1 \
#model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
#model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.position_embedding_type=kerple \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
# commented out to save time on github ci @adithyare
#python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
#trainer.devices=2 \
#trainer.accelerator=gpu \
#trainer.log_every_n_steps=1 \
#trainer.val_check_interval=2 \
#trainer.limit_val_batches=1 \
#trainer.accumulate_grad_batches=1 \
#trainer.max_steps=6 \
#trainer.precision=16 \
#trainer.gradient_clip_val=1.0 \
#exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
#exp_manager.resume_if_exists=True \
#model.tensor_model_parallel_size=2 \
#model.optim.name=fused_adam \
#model.optim.lr=2e-4 \
#model.optim.sched.warmup_steps=2 \
#model.optim.sched.constant_steps=2 \
#model.optim.sched.min_lr=8e-5 \
#model.max_position_embeddings=128 \
#model.encoder_seq_length=128 \
#model.data.seq_length=128 \
#model.position_embedding_type=kerple \
#model.normalization=rmsnorm \
#model.bias=False \
#model.bias_activation_fusion=False \
#model.bias_dropout_add_fusion=False \
#model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
#model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
#model.num_layers=8 \
#model.hidden_size=256 \
#model.num_attention_heads=8 \
#model.activations_checkpoint_method=block \
#model.activations_checkpoint_granularity=full \
#model.activations_checkpoint_num_layers=1 \
#model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
#model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.mcore_gpt=True \
model.megatron_amp_O2=True \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.activation=fast-swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
model.transformer_block_type=normformer \
model.headscale=True \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
model.mcore_gpt=True \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.activation=fast-swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
model.transformer_block_type=normformer \
model.headscale=True \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
#@athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and test file until we have support for multiple dataloaders in lightning 2.0
L2_Megatron_GPT_Finetuning_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
+trainer.limit_val_batches=2 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.peft.peft_scheme=null \
model.data.train_ds.micro_batch_size=1 \
model.data.train_ds.global_batch_size=4 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \
model.data.train_ds.num_workers=0 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.global_batch_size=1 \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.test_ds.names=[quarel] \
model.data.validation_ds.micro_batch_size=1 \
model.data.validation_ds.global_batch_size=1 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.peft.peft_scheme=null \
model.data.train_ds.micro_batch_size=1 \
model.data.train_ds.global_batch_size=4 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \
model.data.train_ds.num_workers=0 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.global_batch_size=1 \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.test_ds.names=[quarel] \
model.data.validation_ds.micro_batch_size=1 \
model.data.validation_ds.global_batch_size=1 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
rm -rf examples/nlp/language_modeling/gpt_sft_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Finetuning_StarCoder_PP1:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.precision=32 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
trainer.enable_checkpointing=False \
+trainer.limit_val_batches=2 \
+trainer.limit_test_batches=2 \
exp_manager.checkpoint_callback_params.save_best_model=False \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \
model.peft.peft_scheme=none \
model.optim.name=distributed_fused_adam \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.num_workers=0 \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.test_ds.num_workers=0 \
model.data.train_ds.concat_sampling_probabilities=[1.0]
rm -rf examples/nlp/language_modeling/gpt_sft_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Embedding:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \
exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]
python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \
trainer.devices=1 \
trainer.num_nodes=1 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
model.peft.lora_tuning.adapter_dim=8 \
model.data.test_ds.write_embeddings_to_file=True \
model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \
model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl]
rm -rf /home/TestData/nlp/megatron_ir/working_dir
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_PEFT_Lora_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_PEFT_Lora_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/lora_tuning_tp2
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.peft.peft_scheme='lora' \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
model.data.test_ds.names=['quarel4'] \
model.global_batch_size=2 \
model.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=10 \
model.data.test_ds.write_predictions_to_file=True \
model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'
rm -rf /home/TestData/nlp/lora_tuning_tp2
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Eval:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_eval.py \
gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \
prompts=['How to fix GPU memory? A:'] \
tensor_model_parallel_size=1 \
inference.tokens_to_generate=32 \
trainer.precision=32
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_Eval_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_eval.py \
gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
server=False \
tensor_model_parallel_size=1 \
pipeline_model_parallel_size=2 \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.precision=32
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
model.peft.restore_from_path=null \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \
model.data.test_ds.names=[test] \
model.data.test_ds.global_batch_size=1 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=30 \
model.data.test_ds.max_seq_length=6000 \
model.data.test_ds.write_predictions_to_file=True \
model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path=examples/nlp/language_modeling/out.jsonl && \
rm -rf examples/nlp/language_modeling/out.jsonl
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# TODO: Add this test back. Test was failing on CI machines due to HW error
# - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval
# when {
# anyOf {
# branch main
# changeRequest target: main
# }
# }
# failFast true
# - run: |
# python -m torch.distributed.launch --nproc_per_node=2 \
# examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \
# --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \
# --checkpoint_name=model_optim_rng.pt \
# --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \
# --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \
# --model_type=gpt \
# --pipeline_model_parallel_size=1 \
# --gpus_per_node=2 \
# --tensor_model_parallel_size=2"
# python examples/nlp/language_modeling/megatron_gpt_eval.py \
# --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \
# --tokens_to_generate=32 \
# --tensor_model_parallel_size=2 \
# --prompt=This is a test.
# rm examples/nlp/language_modeling/small_gpt.nemo
# L2_Megatron_Change_Partitions
L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_change_num_partitions.py \
--model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
--target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \
--tensor_model_parallel_size 2 \
--target_tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--target_pipeline_model_parallel_size 2
rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_change_num_partitions.py \
--model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
--target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \
--tensor_model_parallel_size 2 \
--target_tensor_model_parallel_size 4 \
--pipeline_model_parallel_size 1 \
--target_pipeline_model_parallel_size 1
rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=relative \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=fast-swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=relative \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=fast-swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=alibi \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=alibi \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=kerple \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.masked_softmax_fusion=False \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.position_embedding_type=kerple \
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=swiglu \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=pre_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False \
model.share_token_embeddings=False \
model.share_decoder_tokens_head_embeddings=False
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.pipeline_model_parallel_split_rank=1 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
model.encoder.hidden_size=64 \
model.decoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_attention_heads=8 \
model.decoder.ffn_hidden_size=2048 \
model.encoder.activation=gelu \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=post_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.pipeline_model_parallel_split_rank=1 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
model.encoder.hidden_size=64 \
model.decoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_attention_heads=8 \
model.decoder.ffn_hidden_size=2048 \
model.encoder.activation=gelu \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=post_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_w_Mixture_of_Expert_Pretraining:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_split_rank=1 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
model.encoder.num_moe_experts=4 \
model.decoder.num_moe_experts=4 \
model.encoder.moe_frequency=3 \
model.decoder.moe_frequency=1 \
model.encoder.hidden_size=64 \
model.decoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_attention_heads=8 \
model.decoder.ffn_hidden_size=2048 \
model.encoder.activation=gelu \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=pre_ln \
model.decoder.transformer_block_type=post_ln \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=normformer \
model.encoder.headscale=True \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=geglu \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.decoder.transformer_block_type=normformer \
model.decoder.headscale=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=swiglu \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.encoder.transformer_block_type=normformer \
model.encoder.headscale=True \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=geglu \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.decoder.transformer_block_type=normformer \
model.decoder.headscale=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_Eval:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
--prompt 'How do I fix my GPU memory issue? I am seeing <mask> out of memory.' \
--tensor_model_parallel_size 1
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_BART_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation='reglu' \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='reglu' \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=5 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation='reglu' \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method='block' \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation='reglu' \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
model.decoder.activations_checkpoint_num_layers=1 \
model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
rm -rf examples/nlp/language_modeling/bart_pretrain_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_BART_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
model.pipeline_model_parallel_size=2 \
model.pipeline_model_parallel_split_rank=1 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=geglu \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=geglu \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.data.respect_document_boundaries=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]
python examples/nlp/language_modeling/megatron_bart_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
exp_manager.resume_if_exists=True \
model.pipeline_model_parallel_size=2 \
model.pipeline_model_parallel_split_rank=1 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.encoder.activation=geglu \
model.encoder.bias_activation_fusion=False \
model.encoder.activations_checkpoint_method=block \
model.encoder.activations_checkpoint_num_layers=1 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.decoder.activation=geglu \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method=block \
model.decoder.activations_checkpoint_num_layers=1 \
model.data.respect_document_boundaries=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]
rm -rf examples/nlp/language_modeling/bart_pretrain_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Megatron T5 GLUE/XNLI Finetuning
# TODO(Oktai15): update it in 1.8.0 version
L2_Megatron_T5_GLUE_RTE:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
trainer.devices=1 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
+trainer.limit_test_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=2 \
trainer.precision=16 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
model.pipeline_model_parallel_size=1 \
model.pipeline_model_parallel_split_rank=0 \
model.data.train_ds.task_name=rte \
model.data.train_ds.global_batch_size=4 \
model.data.train_ds.micro_batch_size=2 \
model.data.validation_ds.global_batch_size=2 \
model.data.validation_ds.micro_batch_size=2 \
model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
model.data.validation_ds.task_name=rte \
model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv
rm -rf examples/nlp/language_modeling/t5_glue_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_GLUE_XNLI:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
-cn megatron_t5_config_finetune_glue_xnli \
trainer.devices=1 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
+trainer.limit_test_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=2 \
trainer.precision=16 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
model.pipeline_model_parallel_size=1 \
model.pipeline_model_parallel_split_rank=0 \
model.data.train_ds.global_batch_size=4 \
model.data.train_ds.micro_batch_size=2 \
model.data.validation_ds.global_batch_size=2 \
model.data.validation_ds.micro_batch_size=2 \
model.data.test_ds.global_batch_size=2 \
model.data.test_ds.micro_batch_size=2 \
model.data.train_ds.task_name=rte \
model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
model.data.validation_ds.task_name=xnli \
model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \
model.data.test_ds.task_name=xnli \
model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv
rm -rf examples/nlp/language_modeling/t5_xnli_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_T5_PEFT_Lora_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/t5_lora_tuning_tp2
python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
model.peft.restore_from_hparams_path=null \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
model.data.test_ds.names=[quarel4] \
model.global_batch_size=2 \
model.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=10 \
model.data.test_ds.write_predictions_to_file=True \
model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl
rm -rf /home/TestData/nlp/t5_lora_tuning_tp2
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: Megatron Mock Data Generation
L2_Megatron_Mock_Data_Generation_MockGPTDataset:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.max_steps=10 \
trainer.limit_val_batches=7 \
trainer.val_check_interval=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.mcore_gpt=True \
model.data.data_impl=mock \
model.data.data_prefix=[]
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_Megatron_Mock_Data_Generation_MockT5Dataset:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.max_steps=10 \
trainer.limit_val_batches=3 \
trainer.val_check_interval=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.data.data_impl=mock \
model.data.data_prefix=[]
rm -rf examples/nlp/language_modeling/t5_pretrain_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: TTS Fast dev runs 1
L2_TTS_Fast_dev_runs_1_Tacotron_2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure-gpus-1
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/tts/tacotron2.py \
train_dataset=/home/TestData/an4_dataset/an4_train.json \
validation_datasets=/home/TestData/an4_dataset/an4_val.json \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
trainer.strategy=auto \
model.decoder.decoder_rnn_dim=256 \
model.decoder.attention_rnn_dim=1024 \
model.decoder.prenet_dim=128 \
model.postnet.postnet_n_convolutions=3 \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
model.validation_ds.dataloader_params.num_workers=0 \
~model.text_normalizer \
~model.text_normalizer_call_kwargs \
~trainer.check_val_every_n_epoch
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_TTS_Fast_dev_runs_1_WaveGlow:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/tts/waveglow.py \
train_dataset=/home/TestData/an4_dataset/an4_train.json \
validation_datasets=/home/TestData/an4_dataset/an4_val.json \
trainer.devices="[0]" \
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
trainer.strategy=auto \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
model.validation_ds.dataloader_params.num_workers=0 \
model.waveglow.n_flows=4 \
model.waveglow.n_wn_layers=2 \
model.waveglow.n_wn_channels=32 \
~trainer.check_val_every_n_epoch
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_TTS_Fast_dev_runs_1_FastPitch:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/tts/fastpitch.py \
--config-name fastpitch_align_v1.05 \
train_dataset=/home/TestData/an4_dataset/an4_train.json \
validation_datasets=/home/TestData/an4_dataset/an4_val.json \
sup_data_path=/home/TestData/an4_dataset/beta_priors \
trainer.devices="[0]" \
+trainer.limit_train_batches=1 \
+trainer.limit_val_batches=1 \
trainer.max_epochs=1 \
trainer.strategy=auto \
model.pitch_mean=212.35873413085938 \
model.pitch_std=68.52806091308594 \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
model.validation_ds.dataloader_params.num_workers=0 \
model.symbols_embedding_dim=64 \
model.input_fft.d_inner=384 \
model.input_fft.n_layer=2 \
model.output_fft.d_inner=384 \
model.output_fft.n_layer=2 \
~trainer.check_val_every_n_epoch \
~model.text_normalizer \
~model.text_normalizer_call_kwargs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/tts/radtts.py \
# train_dataset=/home/TestData/an4_dataset/an4_train.json \
# validation_datasets=/home/TestData/an4_dataset/an4_val.json \
# sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \
# trainer.devices="[0]" \
# +trainer.limit_train_batches=1 \
# +trainer.limit_val_batches=1 \
# trainer.max_epochs=1 \
# trainer.strategy=auto \
# model.pitch_mean=212.35873413085938 \
# model.pitch_std=68.52806091308594 \
# model.train_ds.dataloader_params.batch_size=4 \
# model.train_ds.dataloader_params.num_workers=0 \
# model.validation_ds.dataloader_params.batch_size=4 \
# model.validation_ds.dataloader_params.num_workers=0 \
# export_dir=/home/TestData/radtts_test \
# model.optim.lr=0.0001 \
# model.modelConfig.decoder_use_partial_padding=True \
# ~trainer.check_val_every_n_epoch \
# ~model.text_normalizer \
# ~model.text_normalizer_call_kwargs
# #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# # if: "failure()"
L2_TTS_Fast_dev_runs_1_Mixer-TTS:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/tts/mixer_tts.py \
train_dataset=/home/TestData/an4_dataset/an4_train.json \
validation_datasets=/home/TestData/an4_dataset/an4_val.json \
sup_data_path=/home/TestData/an4_dataset/sup_data \
trainer.devices="[0]" \
+trainer.limit_train_batches=1 \
+trainer.limit_val_batches=1 \
trainer.max_epochs=1 \
trainer.strategy=auto \
model.pitch_mean=212.35873413085938 \
model.pitch_std=68.52806091308594 \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
model.validation_ds.dataloader_params.num_workers=0 \
~trainer.check_val_every_n_epoch \
~model.text_normalizer \
~model.text_normalizer_call_kwargs
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
L2_TTS_Fast_dev_runs_1_Hifigan:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/tts/hifigan.py \
train_dataset=/home/TestData/an4_dataset/an4_train.json \
validation_datasets=/home/TestData/an4_dataset/an4_val.json \
trainer.devices="[0]" \
+trainer.limit_train_batches=1 \
+trainer.limit_val_batches=1 \
+trainer.max_epochs=1 \
trainer.strategy=auto \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
model.validation_ds.dataloader_params.num_workers=0 \
model.generator.upsample_initial_channel=64 \
+model.debug=true \
~trainer.check_val_every_n_epoch
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# L2: NeRF
# L2_NeRF_DreamFusion:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/multimodal/text_to_image/nerf/main.py \
# trainer.num_nodes=1 \
# trainer.devices="[0]" \
# trainer.max_steps=1000 \
# model.prompt="a DSLR photo of a delicious hamburger" \
# exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results
#
# rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"
Speech_Checkpoints_tests:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 20
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \
pretrained_name=QuartzNet15x5Base-En \
dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
batch_size=64 \
tolerance=0.1012
rm -f examples/asr/evaluation_transcripts.json
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
Nemo_CICD_Test:
needs:
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
- L2_Community_LLM_Checkpoints_tests_Falcon
#- L2_Community_LLM_Checkpoints_tests_Baichuan2
- ASR_dev_run_Speech_to_Text
- ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
- ASR_dev_run_Speech_Pre-training_-_CitriNet
- ASR_dev_run_Speech_To_Text_Finetuning
#- OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning
- ASR_dev_run_Speech_to_Text_WPE_-_Conformer
- ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
- L2_Speech_to_Text_EMA
- L2_Speaker_dev_run_Speaker_Recognition
- L2_Speaker_dev_run_Speaker_Diarization
- L2_Speaker_dev_run_Speech_to_Label
- L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
- L2_Speaker_dev_run_Clustering_Diarizer_Inference
- L2_Speaker_dev_run_Neural_Diarizer_Inference
- L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
- L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
- L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
- L2_ASR_Adapters_Linear_Adapters
- L2_ASR_Adapters_RelPos_MHA_Adapters
- L2_Speech_Transcription_Speech_to_Text_Transcribe
- L2_Transducer_alignment_Running_pytest
- L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
- L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
- L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
- L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
- L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA
- L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel
- L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel
- L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel
- L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier
- L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel
- L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel
- L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel
- L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel
- L2_Duplex_Text_Normalization_with_Tarred_dataset
- L2_BERT_Text_Classification_with_BERT_Test
- L2_Parallel_BERT_Question-Answering_SQUAD_v1_1
- L2_Parallel_BERT_Question-Answering_SQUAD_v2_0
- L2_Parallel_BART_Question-Answering_SQUAD_v1_1
- L2_Parallel_BART_Question-Answering_SQUAD_v2_0
- L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1
- L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0
- L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification
- L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification
- L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test
- L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test
- L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1
- L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification
- L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation
- L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data
- Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset
- Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir
- Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text
- L2_Pretraining_BERT_pretraining_from_Text
- L2_Pretraining_BERT_from_Preprocessed
- L2_Entity_Linking_Self_Alignment_Pretraining_BERT
- L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN
- L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN
- L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation
- L2_NMT_Attention_is_All_You_Need_Inference
- L2_NMT_Attention_is_All_You_Need_Finetuning
- L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation
- L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation
- L2_Megatron_NMT_Training_TP2
- L2_Megatron_BART_Perceiver_MIM_Training_TP2
- L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism
- L2_Megatron_Bert_Pretraining_and_Resume_Training
- L2_Megatron_Core_Bert_Pretraining_and_Resume_Training
- L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training
- L2_Megatron_RETRO_Pretraining_and_Resume_Training
- L2_BioMegatron_Bert_NER_Task
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
- L2_Megatron_GPT_Finetuning_PP2
- L2_Megatron_GPT_Finetuning_StarCoder_PP1
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2
- L2_Megatron_GPT_PEFT_Lora_TP2
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
- L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
- L2_Megatron_T5_w_Mixture_of_Expert_Pretraining
- L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Eval
- L2_Megatron_BART_Pretraining_and_Resume_Training_TP2
- L2_Megatron_BART_Pretraining_and_Resume_Training_PP2
- L2_Megatron_T5_GLUE_RTE
- L2_Megatron_T5_GLUE_XNLI
- L2_Megatron_T5_PEFT_Lora_TP2
- L2_Megatron_Mock_Data_Generation_MockGPTDataset
- L2_Megatron_Mock_Data_Generation_MockT5Dataset
- L2_TTS_Fast_dev_runs_1_Tacotron_2
- L2_TTS_Fast_dev_runs_1_WaveGlow
- L2_TTS_Fast_dev_runs_1_FastPitch
#- OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS
- L2_TTS_Fast_dev_runs_1_Mixer-TTS
- L2_TTS_Fast_dev_runs_1_Hifigan
- Speech_Checkpoints_tests
runs-on: ubuntu-latest
steps:
# This should depend on all the tests so we block/unblock based on all tests passing
- run: exit 0