Update NeMo Clip to Use MCore Modules #9594

Merged · 22 commits · Jul 8, 2024
8 changes: 0 additions & 8 deletions examples/multimodal/convert_ckpt_to_nemo.py
@@ -165,14 +165,6 @@ def convert(local_rank, rank, world_size, args):
model = MegatronControlNet.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'kosmos':
model = MegatronKosmosModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'neva':
model = MegatronNevaModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
else:
raise ValueError(f"Unrecognized model_type {args.model_type}.")

@@ -1,3 +1,50 @@
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 32
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
check_val_every_n_epoch: null
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0
benchmark: False
enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_clip
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
resume_from_checkpoint: ${model.resume_from_checkpoint}
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
ema:
enable: False
decay: 0.9999
validate_original_weights: False
every_n_steps: 1
cpu_offload: False
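
The `model_parallel_size` entry above uses a `${multiply:...}` interpolation, which is an OmegaConf custom resolver (NeMo registers one internally). A minimal, self-contained sketch of how such a resolver behaves; the registration below is illustrative, not NeMo's own code:

```python
from omegaconf import OmegaConf

# Illustrative only: register a "multiply" resolver so that
# ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
# resolves to TP * PP. NeMo ships its own registration; this is a stand-in.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True)

cfg = OmegaConf.create({
    "model": {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 4},
    "model_parallel_size": "${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}",
})
print(cfg.model_parallel_size)  # 8
```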

model:
precision: 32
# specify micro_batch_size, global_batch_size, and model parallelism
@@ -19,6 +66,9 @@ model:
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather; setting this to False may cause convergence issues

mcore_gpt: False
transformer_engine: False
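
The two flags above are the switch this PR adds: this config keeps the legacy path (both `False`), while the MCore configs further down enable both. A toy sketch of the kind of dispatch the flags imply; the builder here is purely illustrative and not a NeMo API:

```python
# Purely illustrative: none of these names are NeMo functions.
def pick_backend(cfg: dict) -> str:
    if cfg.get("mcore_gpt", False):
        te = "with Transformer Engine" if cfg.get("transformer_engine", False) else "without Transformer Engine"
        return f"Megatron Core modules {te}"
    return "legacy NeMo/Megatron modules"

print(pick_backend({"mcore_gpt": False, "transformer_engine": False}))  # legacy NeMo/Megatron modules
print(pick_backend({"mcore_gpt": True, "transformer_engine": True}))    # Megatron Core modules with Transformer Engine
```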

vision:
precision: 32
# vision configs
@@ -135,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: True

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
@@ -68,6 +68,8 @@ model:
# numerical results as the naïve method.
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather; setting this to False may cause convergence issues
mcore_gpt: True
transformer_engine: True

vision:
precision: ${trainer.precision}
@@ -183,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: False

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
@@ -6,7 +6,7 @@ trainer:
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16
precision: 32 # 16, 32, or bf16

model:
restore_from_path: null # Path to a trained ViT .nemo file
@@ -0,0 +1,251 @@
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 32
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
check_val_every_n_epoch: null
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0
benchmark: False
enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_clip
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
resume_from_checkpoint: ${model.resume_from_checkpoint}
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
ema:
enable: False
decay: 0.9999
validate_original_weights: False
every_n_steps: 1
cpu_offload: False

model:
precision: 32
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
micro_batch_size: 1 # limited by GPU memory
global_batch_size: 1 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism
virtual_pipeline_model_parallel_size: null # interleaved pipeline

restore_from_pretrained: null # used in fine-tuning
# multimodal configs
output_dim: 1152
# As the number of devices used to train increases, so does the space complexity of
# the logit matrix. Using a naïve all-gather scheme, space complexity will be
# `O(n^2)`. Instead, complexity may become effectively linear if the flags
# `--gather-with-grad` and `--local-loss` are used. This alteration produces results that are
# numerically identical to the naïve method.
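
A minimal PyTorch sketch of the local-loss idea described in the comment above (not NeMo's implementation; note that plain `dist.all_gather` detaches gradients, which is why the separate `gather_with_grad` option exists):

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F

def clip_local_loss(img_feat, txt_feat, logit_scale, rank, world_size):
    # Each rank keeps an (N_local x N_global) logit block instead of the full
    # (N_global x N_global) matrix, so memory grows linearly with world size.
    gathered_img = [torch.zeros_like(img_feat) for _ in range(world_size)]
    gathered_txt = [torch.zeros_like(txt_feat) for _ in range(world_size)]
    dist.all_gather(gathered_img, img_feat)   # detaches grads; a differentiable
    dist.all_gather(gathered_txt, txt_feat)   # gather is what gather_with_grad enables
    all_img, all_txt = torch.cat(gathered_img), torch.cat(gathered_txt)

    logits_i = logit_scale * img_feat @ all_txt.t()   # N_local x N_global
    logits_t = logit_scale * txt_feat @ all_img.t()

    n_local = img_feat.shape[0]
    labels = torch.arange(n_local, device=img_feat.device) + rank * n_local
    return 0.5 * (F.cross_entropy(logits_i, labels) + F.cross_entropy(logits_t, labels))
```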

use_siglip: True
mcore_gpt: True
transformer_engine: True

vision:
precision: 32
# vision configs
patch_dim: 14
img_h: 378
img_w: 378
image_mean: null
image_std: null
num_channels: 3
drop_patch_rate: 0.0
drop_path_rate: 0.0
global_average_pool: False
output_dim: ${model.output_dim}
class_token_length: 0
preprocess_layernorm: True # apply layer norm to embedded tokens

# model architecture
encoder_seq_length: 196
max_position_embeddings: ${.encoder_seq_length}
position_embedding_type: learned_absolute
num_layers: 27
hidden_size: 1152
ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 16
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
use_scaled_init_method: True # use scaled residuals initialization
hidden_dropout: 0. # Dropout probability for hidden state transformer.
attention_dropout: 0.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
normalization: layernorm # Type of normalization layers
layernorm_epsilon: 1e-5
do_layer_norm_weight_decay: False # True means weight decay on all params
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.

## Activation Checkpointing
activations_checkpoint_granularity: null # 'selective' or 'full'
activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
activations_checkpoint_num_layers: null # not used with 'selective'
sequence_parallel: False

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# model fusions
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
openai_gelu: True
bias_activation_fusion: False
megatron_legacy: True
activation: approx-gelu



text:
precision: 32
# text configs
output_dim: ${model.output_dim}

# model architecture
encoder_seq_length: 64
max_position_embeddings: ${.encoder_seq_length}
position_embedding_type: learned_absolute
num_layers: 27
hidden_size: 1152
ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 16
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
use_scaled_init_method: True # use scaled residuals initialization
hidden_dropout: 0. # Dropout probability for hidden state transformer.
attention_dropout: 0.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
normalization: layernorm # Type of normalization layers
layernorm_epsilon: 1e-5
do_layer_norm_weight_decay: False # True means weight decay on all params
pre_process: True # add embedding
post_process: True # add pooler
persist_layer_norm: True # Use of persistent fused layer norm kernel.

## Activation Checkpointing
activations_checkpoint_granularity: null # 'selective' or 'full'
activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
activations_checkpoint_num_layers: null # not used with 'selective'
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: False

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
hysteresis: 2 # Gradient scale hysteresis
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# model fusions
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
openai_gelu: True
bias_activation_fusion: False
megatron_legacy: True

fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
activation: approx-gelu
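
The `fp8_*` keys above mirror the fields of a Transformer Engine delayed-scaling recipe. A hedged sketch of the mapping (the TE classes are real, but the helper and the exact wiring into NeMo are illustrative; older TE releases also took an `interval` argument corresponding to `fp8_interval`):

```python
from transformer_engine.common.recipe import DelayedScaling, Format

def fp8_recipe_from_cfg(cfg: dict) -> DelayedScaling:
    # Map the fp8_* config keys above onto a TE recipe (sketch only).
    fmt = Format.HYBRID if cfg.get("fp8_hybrid") else Format.E4M3
    return DelayedScaling(
        margin=cfg.get("fp8_margin", 0),
        fp8_format=fmt,
        amax_history_len=cfg.get("fp8_amax_history_len", 1),
        amax_compute_algo=cfg.get("fp8_amax_compute_algo", "most_recent"),
    )

# Typical use: wrap forward passes in
# transformer_engine.pytorch.fp8_autocast(enabled=cfg["fp8"], fp8_recipe=fp8_recipe_from_cfg(cfg))
```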

# Megatron O2-style half-precision
megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce

# miscellaneous
seed: 1234
resume_from_checkpoint: null # manually set the checkpoint file to load from
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

tokenizer:
library: 'huggingface'
type: 'google/siglip-so400m-patch14-384'
model: null
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
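
The tokenizer block points at the SigLIP checkpoint on the Hugging Face Hub. A small sketch of loading it and rounding the vocab up to a multiple of `make_vocab_size_divisible_by` (assumes a `transformers` version with SigLIP support; NeMo's own padding additionally factors in tensor parallel size):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/siglip-so400m-patch14-384")

divisor = 128  # make_vocab_size_divisible_by
padded_vocab = ((tok.vocab_size + divisor - 1) // divisor) * divisor
print(tok.vocab_size, padded_vocab)
```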

data:
num_workers: 8
train:
dataset_path: # List of paths to pkl files or tar files
- /datasets/coyo/test.pkl
validation: # List of paths to pkl files or tar files
dataset_path:
- /datasets/coyo/test.pkl
webdataset:
infinite_sampler: False
local_root_path: /datasets/coyo

imagenet_val: null # Path to imagenet val set for conducting zero shot evaluation.

# Nsys profiling options
nsys_profile:
enabled: False
start_step: 10 # Global batch to start profiling
end_step: 10 # Global batch to end profiling
ranks: [ 0 ] # Global rank IDs to profile
gen_shape: False # Generate model and kernel details including input shapes
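
The `nsys_profile` block bounds profiling to a step window. A minimal sketch of that gating using the CUDA profiler hooks that `nsys profile --capture-range=cudaProfilerApi` listens for (illustrative; not NeMo's actual callback):

```python
import torch

def on_train_batch_start(step: int, prof_cfg: dict):
    if prof_cfg.get("enabled") and step == prof_cfg["start_step"]:
        torch.cuda.profiler.start()   # opens the Nsight Systems capture range

def on_train_batch_end(step: int, prof_cfg: dict):
    if prof_cfg.get("enabled") and step == prof_cfg["end_step"]:
        torch.cuda.profiler.stop()    # closes the capture range
```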

optim:
name: fused_adam
lr: 1e-3
weight_decay: 0.2
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 2000
constant_steps: 0
min_lr: 1e-5
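
The optimizer block corresponds to a linear warmup followed by cosine decay from `lr` to `min_lr`. A small illustrative formula for the resulting schedule (not NeMo's exact `CosineAnnealing` class, which also handles `constant_steps`):

```python
import math

def lr_at(step: int, max_steps: int = 375000, warmup: int = 2000,
          lr: float = 1e-3, min_lr: float = 1e-5) -> float:
    # Linear warmup to `lr`, then cosine decay to `min_lr` over the remaining steps.
    if step < warmup:
        return lr * step / warmup
    progress = (step - warmup) / max(1, max_steps - warmup)
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(1000), lr_at(2000), lr_at(375000))  # approx 5e-4, 1e-3, 1e-5
```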
@@ -283,6 +283,7 @@ def convert(local_rank, rank, world_size, args):


if __name__ == '__main__':
logging.warning("This script is going to be deprecated soon. Please use ")
args = get_args()
local_rank, rank, world_size = initialize_distributed(args)
convert(local_rank, rank, world_size, args)
@@ -22,16 +22,17 @@
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

mp.set_start_method("spawn", force=True)


@hydra_runner(config_path="conf", config_name="megatron_clip_config")
def main(cfg) -> None:
logging.info("\n\n************** Experiment configuration ***********")
logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

assert (
cfg.trainer.devices * cfg.trainer.num_nodes
cfg.trainer.devices
* cfg.trainer.num_nodes
// cfg.model.tensor_model_parallel_size
// cfg.model.pipeline_model_parallel_size
) * cfg.model.micro_batch_size == cfg.model.global_batch_size, (
"Gradient accumulation is not supported in CLIP yet."
)
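
The assertion above encodes the relationship global_batch_size = (devices * num_nodes / TP / PP) * micro_batch_size, i.e. gradient accumulation stays disabled for CLIP. A quick worked check with hypothetical numbers:

```python
def check_no_grad_accum(devices, num_nodes, tp, pp, micro_bs, global_bs):
    # data-parallel world size = total GPUs / (tensor parallel * pipeline parallel)
    dp = devices * num_nodes // tp // pp
    assert dp * micro_bs == global_bs, "Gradient accumulation is not supported in CLIP yet."
    return dp

# e.g. 2 nodes x 8 GPUs, TP=2, PP=1 -> DP=8; with micro_batch_size=32 the
# global batch size must be 256.
print(check_no_grad_accum(devices=8, num_nodes=2, tp=2, pp=1, micro_bs=32, global_bs=256))  # 8
```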