forked from NVIDIA/NeMo
Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
Showing 1 changed file with 355 additions and 0 deletions.
examples/multimodal/speech_llm/conf/modular_audio_gpt_config_cross_llama_lhotse_multi.yaml
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This configuration is similar to modular_audio_gpt_config_cross_llama_lhotse.yaml;
# the difference is in how it performs multimodal sampling.
# The changes are in the model.data.train_ds section.
# You'll notice that it defines two sub-sections: audio and text.
# Their names are arbitrary: you may define as many sub-sections as you like, including repeated modalities.
# We still set up a single dataloader, but each sub-section creates its own sampler with its own batch-size-related settings.
# That means each sub-section may decide about its own static/dynamic batch sizes, bucketing, etc.
# These samplers are later combined into a single sampler using one of three available sampler fusion strategies:
# round_robin (the samplers take turns), randomized_round_robin (at each step a sampler is selected according to weights),
# or zip (a mini-batch is drawn from each sampler and the results are combined).
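#
# For example, to favor one modality when using weighted selection, you might override the fusion
# strategy roughly as sketched below. This is only an illustration: the `sampler_weights` key name is
# an assumption and should be checked against the Lhotse dataloading options in your NeMo version.
#   sampler_fusion: randomized_round_robin
#   sampler_weights: [0.7, 0.3]  # assumed key: relative probability of picking the audio vs. text sampler at each step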
name: megatron_audio_gpt_bestow_lhotse_multi_sampler

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 9999
  max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  limit_train_batches: 1000
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 1000 # if an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation at that fraction of an epoch, e.g. 0.25 runs validation every quarter epoch
  gradient_clip_val: 1.0
  accumulate_grad_batches: 1

model_target: nemo.collections.multimodal.speech_llm.models.modular_models.CrossAttendModularAudioGPTModel

exp_manager:
  # explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${model.data.validation_ds.metric.name}
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True
    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{epoch}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    always_save_nemo: False
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True
    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism

  pretrained_audio_model: stt_en_fastconformer_transducer_large
  freeze_llm: True
  freeze_audio_encoder: False
  freeze_modality_adapter: False
  load_audio_encoder: True

  global_batch_size: 128
  micro_batch_size: 4
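  # Note: with these values global_batch_size / micro_batch_size = 32; this factor is split across
  # data-parallel ranks and gradient accumulation steps (see the consumed_samples formula in the trainer section).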
  restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: False

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: True
  gradient_as_bucket_view: False

  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0

  # use_am_tokenizer: True
  # override_vocab_size: 1024

  peft:
    peft_scheme: "lora" # can be either lora, adapter, ia3 or ptuning
    restore_from_path: null

    # Used for adapter peft training
    adapter_tuning:
      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
      adapter_dim: 32
      adapter_dropout: 0.0
      norm_position: 'pre' # This can be set to 'pre', 'post' or null; 'pre' is normally what is used.
      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
      layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layers 1 (lowest) and 12. null will apply adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True

    lora_tuning:
      target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # options: 'attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2', or the grouped 'attention' (qkv & dense) and 'mlp' (fc1 & fc2)
      adapter_dim: 32
      alpha: ${model.peft.lora_tuning.adapter_dim}
      adapter_dropout: 0.0
      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      layer_selection: null # selects in which layers to add lora adapters, e.g. [1,12] will add lora to layers 1 (lowest) and 12. null will apply adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True
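    # Note: with alpha set equal to adapter_dim above, the usual LoRA scaling factor alpha / adapter_dim works out to 1.0.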

    # Used for p-tuning peft training
    p_tuning:
      virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence
      bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
      embedding_dim: 1024 # the size of the prompt encoder embeddings
      init_std: 0.023

  perception:
    target: nemo.collections.multimodal.speech_llm.modules.perception_modules.AudioPerceptionModule
    use_multi_layer_feat: false
    xattn:
      target: nemo.collections.multimodal.speech_llm.modules.perception_modules.TransformerCrossAttention
      num_attention_heads: 8
      attn_score_dropout: 0.1
      attn_layer_dropout: 0.1
      ffn_dropout: 0.1
      hidden_act: "relu"
      pre_ln: true
      pre_ln_final_layer_norm: true

    multi_layer_feat:
      layer_idx_list: [0,16] # layer indices to extract features from
      aggregator:
        mode: "cat" # how to combine features from different layers, choices=['cat','sum','mean','max','min'], defaults to concatenation ('cat')
        pooling: "avg" # how to pool features if they have different temporal lengths and align_mode=min, choices=['mean','max','min']
        align_mode: "min" # if features have different temporal lengths, set `min` to pool to the shortest length or `max` to repeat to the longest.

    modality_adapter:
      _target_: nemo.collections.asr.modules.ConformerEncoder
      feat_in: 1024
      feat_out: -1 # you may set it if you need an output size different from the default d_model
      n_layers: 2
      d_model: 512

      # Sub-sampling parameters
      subsampling: dw_striding # vggnet, striding, stacking, stacking_norm or dw_striding
      subsampling_factor: 8 # must be a power of 2 for striding and vggnet
      subsampling_conv_channels: 256 # set to -1 to make it equal to d_model
      causal_downsampling: false

      # Reduction parameters: can be used to add another subsampling layer at a given position.
      # Having a 2x reduction will speed up training and inference while keeping a similar WER.
      # Adding it at the end gives the best WER, while adding it at the beginning gives the best speedup.
      reduction: null # pooling, striding, or null
      reduction_position: null # Encoder block index, or -1 for subsampling at the end of the encoder
      reduction_factor: 1

      # Feed forward module's params
      ff_expansion_factor: 4

      # Multi-headed Attention Module's params
      self_attention_model: rel_pos # rel_pos or abs_pos
      n_heads: 8 # may need to be lower for smaller d_models
      # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
      att_context_size: [-1, -1] # -1 means unlimited context
      att_context_style: regular # regular or chunked_limited
      xscaling: true # scales up the input embeddings by sqrt(d_model)
      untie_biases: true # unties the biases of the TransformerXL layers
      pos_emb_max_len: 5000

      # Convolution module's params
      conv_kernel_size: 9
      conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
      # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1 == conv_kernel_size
      # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
      conv_context_size: null
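      # For example, with conv_kernel_size: 9 above, null resolves to [4, 4] and 'causal' resolves to [8, 0].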

      ### regularization
      dropout: 0.1 # The dropout used in most of the Conformer Modules
      dropout_pre_encoder: 0.1 # The dropout used before the encoder
      dropout_emb: 0.0 # The dropout used for embeddings
      dropout_att: 0.1 # The dropout for multi-headed attention modules

      # set to non-zero to enable stochastic depth
      stochastic_depth_drop_prob: 0.0
      stochastic_depth_mode: linear # linear or uniform
      stochastic_depth_start_layer: 1

    spec_augment:
      _target_: nemo.collections.asr.modules.SpectrogramAugmentation
      freq_masks: 2 # set to zero to disable it
      time_masks: 10 # set to zero to disable it
      freq_width: 27
      time_width: 0.05

    # the following are read from the pretrained AM:
    # output_dim: null
    # encoder: null
    # preprocessor: null

  data:
    end_string: "[EOG]"
    train_ds:
      use_lhotse: true
      multi_config: true
      audio:
        input_cfg: ???
        sampler_fusion: round_robin
        seed: 0
        shard_seed: "trng"
        batch_size: null
        batch_duration: 360
        quadratic_factor: 15
        use_bucketing: true
        num_buckets: 30
        bucket_buffer_size: 20000
        num_workers: 4
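      # With batch_duration: 360, the audio sampler packs roughly 360 seconds of audio per mini-batch
      # (e.g. about 30 cuts of ~12 s each); quadratic_factor additionally penalizes long cuts, so
      # batches contain proportionally fewer of them.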
      text:
        input_cfg: ???
        use_multimodal_sampling: true
        batch_tokens: 8000
        quadratic_factor: 192
        use_bucketing: true
        num_buckets: 30
        bucket_buffer_size: 20000
        num_workers: 4
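      # The text sampler sizes its mini-batches by token count instead: batch_tokens: 8000 packs
      # roughly 8000 text tokens per mini-batch.
      #
      # Both `input_cfg` entries above point to YAML files listing the data sources for their sub-section.
      # A rough sketch of what such a file might contain for the audio group is shown below; the field
      # names follow NeMo's Lhotse input_cfg conventions but are assumptions to verify against your NeMo version:
      #   - type: nemo_tarred
      #     manifest_filepath: /path/to/sharded_manifests/manifest_{0..127}.json
      #     tarred_audio_filepaths: /path/to/audio_tars/audio_{0..127}.tar
      #     weight: 1.0
      #     tags:
      #       lang: en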

      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      max_seq_length: 2048
      min_seq_length: 1
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      # add_eos: False
      end_string: ${model.data.end_string}
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: False
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: "[INST]\n<<SYS>>\nPlease answer the following based on the previous speech feature.\n<</SYS>>\n\n{input}[/INST] {output}"

    validation_ds:
      manifest_filepath: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      shuffle: False
      num_workers: 0
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: False
      context_key: ${model.data.train_ds.context_key}
      label_key: ${model.data.train_ds.label_key}
      add_eos: ${model.data.train_ds.add_eos}
      end_string: ${model.data.end_string}
      add_sep: ${model.data.train_ds.add_sep}
      add_bos: ${model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      tokens_to_generate: 128
      # ASR configs
      sample_rate: 16000 #${model.audio_encoder.preprocessor.sample_rate}

      log_every_n_steps: 10
      metric:
        name: "wer" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

    # test_ds:
    #   manifest_filepath: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
    #   names: null # Names of the corresponding datasets used to log metrics.
    #   global_batch_size: ${model.global_batch_size}
    #   micro_batch_size: ${model.micro_batch_size}
    #   shuffle: False
    #   num_workers: 4
    #   pin_memory: True
    #   max_seq_length: 2048
    #   min_seq_length: 1
    #   drop_last: False
    #   context_key: 'input'
    #   label_key: 'output'
    #   add_eos: ${model.data.train_ds.add_eos}
    #   end_string: ${model.data.end_string}
    #   add_sep: ${model.data.train_ds.add_sep}
    #   add_bos: ${model.data.train_ds.add_bos}
    #   separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline}
    #   write_predictions_to_file: False
    #   output_file_path_prefix: null # Prefix of the file to write predictions to.
    #   truncation_field: "context" # Options: ['context', 'answer']
    #   index_mapping_dir: null # Path to a directory to write index mapping files.
    #   prompt_template: ${model.data.train_ds.prompt_template}
    #   # ASR configs
    #   sample_rate: 16000 #${model.audio_encoder.preprocessor.sample_rate}

    #   metric:
    #     name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
    #     average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
    #     num_classes: null

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false