diff --git a/Jenkinsfile b/Jenkinsfile
index 474db51efdc8..c2357e280afb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -126,72 +126,74 @@ pipeline {
         sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat'
       }
     }
-//
-//    stage('L2: Multimodal Imagen Train') {
-//      when {
-//        anyOf {
-//          branch 'main'
-//          changeRequest target: 'main'
-//        }
-//      }
-//      failFast true
-//      steps {
-//        sh "rm -rf /home/TestData/multimodal/imagen_train"
-//        sh "pip install webdataset==0.2.48"
-//        sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \
-//        trainer.precision=16 \
-//        trainer.num_nodes=1 \
-//        trainer.devices=1 \
-//        ++exp_manager.max_time_per_run=00:00:03:00 \
-//        trainer.max_steps=20 \
-//        model.micro_batch_size=1 \
-//        model.global_batch_size=1 \
-//        model.data.synthetic_data=True \
-//        exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \
-//        model.inductor=False \
-//        model.unet.flash_attention=False \
-//        "
-//        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
-//        sh "rm -rf /home/TestData/multimodal/imagen_train"
-//      }
-//    }
-//
-//    stage('L2: Multimodal Stable Diffusion Train') {
-//      when {
-//        anyOf {
-//          branch 'main'
-//          changeRequest target: 'main'
-//        }
-//      }
-//      failFast true
-//      steps {
-//        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
-//        sh "pip install webdataset==0.2.48"
-//        sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
-//        trainer.precision=16 \
-//        trainer.num_nodes=1 \
-//        trainer.devices=1 \
-//        ++exp_manager.max_time_per_run=00:00:03:00 \
-//        trainer.max_steps=20 \
-//        model.micro_batch_size=1 \
-//        model.global_batch_size=1 \
-//        model.data.synthetic_data=True \
-//        exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \
-//        model.inductor=False \
-//        model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
-//        ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
-//        ++model.cond_stage_config.max_length=77 \
-//        ~model.cond_stage_config.restore_from_path \
-//        ~model.cond_stage_config.freeze \
-//        ~model.cond_stage_config.layer \
-//        model.unet_config.from_pretrained=null \
-//        model.first_stage_config.from_pretrained=null \
-//        model.unet_config.use_flash_attention=False \
-//        "
-//        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
-//        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
-//      }
-//    }
+
+    stage('L2: Multimodal Imagen Train') {
+      when {
+        anyOf {
+          branch 'r1.23.0'
+          changeRequest target: 'r1.23.0'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/imagen_train"
+        sh "pip install webdataset==0.2.48"
+        sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \
+        trainer.precision=16 \
+        trainer.num_nodes=1 \
+        trainer.devices=1 \
+        ++exp_manager.max_time_per_run=00:00:03:00 \
+        trainer.max_steps=20 \
+        model.conditioning.embed_dim=64 \
+        model.micro_batch_size=1 \
+        model.global_batch_size=1 \
+        model.data.synthetic_data=True \
+        exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \
+        model.inductor=False \
+        model.unet.flash_attention=False \
+        "
+        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
+        sh "rm -rf /home/TestData/multimodal/imagen_train"
+      }
+    }
+    stage('L2: Multimodal Stable Diffusion Train') {
+      when {
+        anyOf {
+          branch 'r1.23.0'
+          changeRequest target: 'r1.23.0'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
+        sh "pip install webdataset==0.2.48"
+        sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
+        trainer.precision=16 \
+        trainer.num_nodes=1 \
+        trainer.devices=1 \
+        ++exp_manager.max_time_per_run=00:00:03:00 \
+        trainer.max_steps=20 \
+        model.micro_batch_size=1 \
+        model.global_batch_size=1 \
+        model.data.synthetic_data=True \
+        exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \
+        model.inductor=False \
+        model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
+        ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
+        ++model.cond_stage_config.max_length=77 \
+        ~model.cond_stage_config.restore_from_path \
+        ~model.cond_stage_config.freeze \
+        ~model.cond_stage_config.layer \
+        model.unet_config.from_pretrained=null \
+        model.first_stage_config.from_pretrained=null \
+        model.unet_config.use_flash_attention=False \
+        model.unet_config.attention_resolutions=[1] \
+        model.unet_config.channel_mult=[1] \
+        "
+        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
+      }
+    }
 //    stage('L2: Multimodal ControlNet Train') {
 //      when {
 //        anyOf {
@@ -260,122 +262,122 @@ pipeline {
 //        sh "rm -rf /home/TestData/multimodal/dreambooth_train"
 //      }
 //    }
-//    stage('L2: Vision ViT Pretrain TP=1') {
-//      when {
-//        anyOf {
-//          branch 'main'
-//          changeRequest target: 'main'
-//        }
-//      }
-//      failFast true
-//      steps {
-//        sh "rm -rf /home/TestData/vision/vit_pretrain_tp1"
-//        sh "pip install webdataset==0.2.48"
-//        sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \
-//        trainer.precision=16 \
-//        model.megatron_amp_O2=False \
-//        trainer.num_nodes=1 \
-//        trainer.devices=1 \
-//        trainer.val_check_interval=5 \
-//        ++exp_manager.max_time_per_run=00:00:03:00 \
-//        trainer.max_steps=20 \
-//        model.micro_batch_size=2 \
-//        model.global_batch_size=4 \
-//        model.tensor_model_parallel_size=1 \
-//        model.pipeline_model_parallel_size=1 \
-//        model.data.num_workers=0 \
-//        exp_manager.create_checkpoint_callback=False \
-//        model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \
-//        exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 "
-//        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
-//        sh "rm -rf /home/TestData/vision/vit_pretrain_tp1"
-//      }
-//    }
-//
-//    stage('L2: Multimodal CLIP Pretrain TP=1') {
-//      when {
-//        anyOf {
-//          branch 'main'
-//          changeRequest target: 'main'
-//        }
-//      }
-//      failFast true
-//      steps {
-//        sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1"
-//        sh "pip install webdataset==0.2.48"
-//        sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \
-//        trainer.precision=16 \
-//        model.megatron_amp_O2=False \
-//        trainer.num_nodes=1 \
-//        trainer.devices=1 \
-//        trainer.val_check_interval=10 \
-//        ++exp_manager.max_time_per_run=00:00:03:00 \
-//        trainer.max_steps=20 \
-//        model.micro_batch_size=1 \
-//        model.global_batch_size=1 \
-//        model.tensor_model_parallel_size=1 \
-//        model.pipeline_model_parallel_size=1 \
-//        exp_manager.create_checkpoint_callback=False \
-//        model.data.num_workers=0 \
-//        model.vision.num_layers=2 \
-//        model.text.num_layers=2 \
-//        model.vision.patch_dim=32 \
-//        model.vision.encoder_seq_length=49 \
-//        model.vision.class_token_length=7 \
-//        model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \
-//        model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \
-//        model.data.webdataset.local_root_path=/ \
-//        exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 "
-//        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
-//        sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1"
-//      }
-//    }
-//
-//    stage('L2: Multimodal NeVA Pretrain TP=1') {
-//      when {
-//        anyOf {
-//          branch 'main'
-//          changeRequest target: 'main'
-//        }
-//      }
-//      failFast true
-//      steps {
-//        sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1"
-//        sh "pip install webdataset==0.2.48"
-//        sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \
-//        trainer.precision=bf16 \
-//        model.megatron_amp_O2=False \
-//        trainer.num_nodes=1 \
-//        trainer.devices=1 \
-//        trainer.val_check_interval=10 \
-//        trainer.limit_val_batches=5 \
-//        trainer.log_every_n_steps=1 \
-//        ++exp_manager.max_time_per_run=00:00:03:00 \
-//        trainer.max_steps=20 \
-//        model.micro_batch_size=2 \
-//        model.global_batch_size=4 \
-//        model.tensor_model_parallel_size=1 \
-//        model.pipeline_model_parallel_size=1 \
-//        exp_manager.create_checkpoint_callback=False \
-//        model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \
-//        model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \
-//        model.tokenizer.library=sentencepiece \
-//        model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \
-//        model.num_layers=2 \
-//        model.hidden_size=5120 \
-//        model.ffn_hidden_size=13824 \
-//        model.num_attention_heads=40 \
-//        model.normalization=rmsnorm \
-//        model.data.num_workers=0 \
-//        model.data.conv_template=llama_2 \
-//        model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \
-//        model.mm_cfg.llm.from_pretrained=null \
-//        model.use_flash_attention=false \
-//        exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 "
-//        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
-//        sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1"
-//      }
-//    }
+    stage('L2: Vision ViT Pretrain TP=1') {
+      when {
+        anyOf {
+          branch 'r1.23.0'
+          changeRequest target: 'r1.23.0'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/vision/vit_pretrain_tp1"
+        sh "pip install webdataset==0.2.48"
+        sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \
+        trainer.precision=16 \
+        model.megatron_amp_O2=False \
+        trainer.num_nodes=1 \
+        trainer.devices=1 \
+        trainer.val_check_interval=5 \
+        ++exp_manager.max_time_per_run=00:00:03:00 \
+        trainer.max_steps=20 \
+        model.micro_batch_size=2 \
+        model.global_batch_size=4 \
+        model.tensor_model_parallel_size=1 \
+        model.pipeline_model_parallel_size=1 \
+        model.data.num_workers=0 \
+        exp_manager.create_checkpoint_callback=False \
+        model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \
+        exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 "
+        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
+        sh "rm -rf /home/TestData/vision/vit_pretrain_tp1"
+      }
+    }
+
+    stage('L2: Multimodal CLIP Pretrain TP=1') {
+      when {
+        anyOf {
+          branch 'r1.23.0'
+          changeRequest target: 'r1.23.0'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1"
+        sh "pip install webdataset==0.2.48"
+        sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \
+        trainer.precision=16 \
+        model.megatron_amp_O2=False \
+        trainer.num_nodes=1 \
+        trainer.devices=1 \
+        trainer.val_check_interval=10 \
+        ++exp_manager.max_time_per_run=00:00:03:00 \
+        trainer.max_steps=20 \
+        model.micro_batch_size=1 \
+        model.global_batch_size=1 \
+        model.tensor_model_parallel_size=1 \
+        model.pipeline_model_parallel_size=1 \
+        exp_manager.create_checkpoint_callback=False \
+        model.data.num_workers=0 \
+        model.vision.num_layers=2 \
+        model.text.num_layers=2 \
+        model.vision.patch_dim=32 \
+        model.vision.encoder_seq_length=49 \
+        model.vision.class_token_length=7 \
+        model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \
+        model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \
+        model.data.webdataset.local_root_path=/ \
+        exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 "
+        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
+        sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1"
+      }
+    }
+
+    stage('L2: Multimodal NeVA Pretrain TP=1') {
+      when {
+        anyOf {
+          branch 'r1.23.0'
+          changeRequest target: 'r1.23.0'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1"
+        sh "pip install webdataset==0.2.48"
+        sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \
+        trainer.precision=16 \
+        model.megatron_amp_O2=False \
+        trainer.num_nodes=1 \
+        trainer.devices=1 \
+        trainer.val_check_interval=10 \
+        trainer.limit_val_batches=5 \
+        trainer.log_every_n_steps=1 \
+        ++exp_manager.max_time_per_run=00:00:03:00 \
+        trainer.max_steps=20 \
+        model.micro_batch_size=2 \
+        model.global_batch_size=4 \
+        model.tensor_model_parallel_size=1 \
+        model.pipeline_model_parallel_size=1 \
+        exp_manager.create_checkpoint_callback=False \
+        model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \
+        model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \
+        model.tokenizer.library=sentencepiece \
+        model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \
+        model.num_layers=2 \
+        model.hidden_size=5120 \
+        model.ffn_hidden_size=13824 \
+        model.num_attention_heads=40 \
+        model.normalization=rmsnorm \
+        model.data.num_workers=0 \
+        model.data.conv_template=llama_2 \
+        model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \
+        model.mm_cfg.llm.from_pretrained=null \
+        model.use_flash_attention=false \
+        exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 "
+        sh "pip install 'webdataset>=0.1.48,<=0.1.62'"
+        sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1"
+      }
+    }
 // TODO: this requires TE >= v0.11 which is not available in 23.06.
 //  please uncomment this test once mcore CI is ready.
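
For reference: the stages above drive the example scripts entirely through Hydra-style command-line overrides, where ++key=value adds or overrides a config entry and ~key deletes one before the run starts. A minimal sketch of the equivalent OmegaConf operations, on a made-up config used purely for illustration (not part of this change):

from omegaconf import OmegaConf

# Made-up stand-in for a Hydra-composed model config (illustration only).
cfg = OmegaConf.create({"model": {"cond_stage_config": {"freeze": True, "layer": "last"}}})

# "++model.cond_stage_config.max_length=77" adds the key, or overrides it if it already exists.
OmegaConf.update(cfg, "model.cond_stage_config.max_length", 77, force_add=True)

# "~model.cond_stage_config.layer" removes the key before the script ever sees it.
cfg.model.cond_stage_config.pop("layer")

print(OmegaConf.to_yaml(cfg))
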
diff --git a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py
index 545a634ac7fb..bd3f975e4d54 100644
--- a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py
+++ b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py
@@ -71,15 +71,16 @@ def main(cfg) -> None:
     with open(cfg.prompt_file, 'r') as f:
         lines = f.readlines()
 
+    insert_image_token = cfg.inference.get("insert_image_token", None)
     final_prompts = []
     for line in lines:
         prompt_dict = json.loads(line)
         assert 'prompt' in prompt_dict or 'text' in prompt_dict
         if 'prompt' not in prompt_dict:
             prompt_dict['prompt'] = prompt_dict['text']
-        if cfg.inference.insert_image_token == 'left':
+        if insert_image_token == 'left':
             prompt_dict['prompt'] = '<image>' + prompt_dict['prompt']
-        elif cfg.inference.insert_image_token == 'right':
+        elif insert_image_token == 'right':
             prompt_dict['prompt'] = prompt_dict['prompt'] + '<image>'
         if 'image' in prompt_dict:
             prompt_dict['image_path'] = prompt_dict['image']
diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml
index 8a21fccd0874..d8740bb98eb2 100644
--- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml
+++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml
@@ -79,7 +79,7 @@ model:
     openai_gelu: False
     bias_activation_fusion: False
    megatron_legacy: True
-    activation: quick-gelu
+    activation: approx-gelu
@@ -144,7 +144,7 @@ model:
     fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
     fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
     use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
-    activation: quick-gelu
+    activation: approx-gelu
 
   # Megatron O2-style half-precision
   megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
diff --git a/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml b/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml
index 11dc65155cf5..be820e8d731d 100644
--- a/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml
+++ b/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml
@@ -117,7 +117,7 @@ model:
     openai_gelu: false
     bias_activation_fusion: false
    megatron_legacy: true
-    activation: quick-gelu
+    activation: approx-gelu
 
   text:
     precision: ${trainer.precision}
@@ -171,7 +171,7 @@ model:
     fp8_amax_history_len: 1
     fp8_amax_compute_algo: most_recent
     use_emha: false
-    activation: quick-gelu
+    activation: approx-gelu
 
   # Megatron O2-style half-precision
   megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py
index 4dd6b120c8c8..90f862869369 100644
--- a/nemo/collections/multimodal/data/neva/neva_dataset.py
+++ b/nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -397,7 +397,7 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict:
             if 'label' not in turn:
                 turn[
                     'label'
-                ] = "quality:6,toxicity:0,humor:0,creativity:0,violence:0,helpfulness:6,not_appropriate:0"
+                ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4"
             value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value']
             conv.append_message(turn['from'], value)
             if not turn["value"]:
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index 5fd0fa830dd0..3f4156d0fa73 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -612,8 +612,8 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None):
         output_tensor = self.model(**forward_args)
         return output_tensor
 
-    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
-        return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only)
+    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None):
+        return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step)
 
     def training_step(self, dataloader_iter, batch_idx):
         """
@@ -623,7 +623,7 @@ def training_step(self, dataloader_iter, batch_idx):
         """
         return MegatronGPTModel.training_step(self, dataloader_iter, batch_idx)
 
-    def get_forward_output_and_loss_func(self, validation_step=False):
+    def get_forward_output_and_loss_func(self, validation_step=False, tuning=False):
         def loss_func(output_tensor, loss_mask):
             loss_for_ub = self.loss_func(loss_mask, output_tensor)
             if validation_step and not self.cfg.data.get('validation_drop_last', True):
@@ -921,7 +921,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]:
         Returns:
             List of available pre-trained models.
""" - return [] + return None def setup_test_data(self, cfg): pass diff --git a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py index 901745f09421..9bb490fb8fc8 100644 --- a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py +++ b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py @@ -40,7 +40,9 @@ class LatentDiffusionEdit(LatentDiffusion): - def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + def init_from_ckpt( + self, path, ignore_keys=list(), only_model=False, load_vae=True, load_unet=True, load_encoder=True, + ): pl_sd = torch.load(path, map_location="cpu") if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 31b56443846f..36dfb74fbfaf 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -557,9 +557,9 @@ def __init__(self, cfg, model_parallel_config): self.restarted_from_ckpt = False if ckpt_path is not None: - load_vae = True if cfg.load_vae is None else cfg.load_vae - load_unet = True if cfg.load_unet is None else cfg.load_unet - load_encoder = True if cfg.load_encoder is None else cfg.load_encoder + load_vae = True if cfg.get("load_vae", None) is None else cfg.load_vae + load_unet = True if cfg.get("load_unet", None) is None else cfg.load_unet + load_encoder = True if cfg.get("load_encoder", None) is None else cfg.load_encoder self.init_from_ckpt( ckpt_path, ignore_keys, load_vae=load_vae, load_unet=load_unet, load_encoder=load_encoder,