Multimodal r1.23.0 bug fix #8315

Merged
14 commits merged on Feb 6, 2024
366 changes: 184 additions & 182 deletions Jenkinsfile

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions examples/multimodal/multimodal_llm/neva/neva_evaluation.py
@@ -71,15 +71,16 @@ def main(cfg) -> None:
     with open(cfg.prompt_file, 'r') as f:
         lines = f.readlines()

+    insert_image_token = cfg.inference.get("insert_image_token", None)
     final_prompts = []
     for line in lines:
         prompt_dict = json.loads(line)
         assert 'prompt' in prompt_dict or 'text' in prompt_dict
         if 'prompt' not in prompt_dict:
             prompt_dict['prompt'] = prompt_dict['text']
-        if cfg.inference.insert_image_token == 'left':
+        if insert_image_token == 'left':
             prompt_dict['prompt'] = '<image>' + prompt_dict['prompt']
-        elif cfg.inference.insert_image_token == 'right':
+        elif insert_image_token == 'right':
             prompt_dict['prompt'] = prompt_dict['prompt'] + '<image>'
         if 'image' in prompt_dict:
             prompt_dict['image_path'] = prompt_dict['image']
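The switch from direct attribute access to `cfg.inference.get(...)` makes `insert_image_token` genuinely optional. A minimal sketch of the difference, using an illustrative OmegaConf config that omits the key (this `cfg` is not the one the script builds):

```python
from omegaconf import OmegaConf

# Illustrative config that does not define insert_image_token.
cfg = OmegaConf.create({"inference": {"greedy": True}})

# Safe lookup: returns None when the key is absent, so the prompt is left unchanged.
insert_image_token = cfg.inference.get("insert_image_token", None)

prompt = "Describe the scene."
if insert_image_token == 'left':
    prompt = '<image>' + prompt
elif insert_image_token == 'right':
    prompt = prompt + '<image>'
# With direct attribute access (cfg.inference.insert_image_token), a config
# written without this key can raise instead of falling through cleanly.
```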
@@ -79,7 +79,7 @@ model:
 openai_gelu: False
 bias_activation_fusion: False
 megatron_legacy: True
-activation: quick-gelu
+activation: approx-gelu



@@ -144,7 +144,7 @@ model:
 fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
 fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
 use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
-activation: quick-gelu
+activation: approx-gelu

 # Megatron O2-style half-precision
 megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
@@ -117,7 +117,7 @@ model:
 openai_gelu: false
 bias_activation_fusion: false
 megatron_legacy: true
-activation: quick-gelu
+activation: approx-gelu

 text:
   precision: ${trainer.precision}
@@ -171,7 +171,7 @@ model:
 fp8_amax_history_len: 1
 fp8_amax_compute_algo: most_recent
 use_emha: false
-activation: quick-gelu
+activation: approx-gelu

 # Megatron O2-style half-precision
 megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
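These config hunks change `activation` from `quick-gelu` to `approx-gelu`. For context, a minimal sketch of the two functions these names conventionally refer to; the exact callables NeMo binds to these config strings are an assumption here, not something this diff shows:

```python
import torch

def quick_gelu(x: torch.Tensor) -> torch.Tensor:
    # Sigmoid-based approximation popularized by CLIP: x * sigmoid(1.702 * x).
    return x * torch.sigmoid(1.702 * x)

def approx_gelu(x: torch.Tensor) -> torch.Tensor:
    # Tanh-based GELU approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))).
    return torch.nn.functional.gelu(x, approximate="tanh")

x = torch.linspace(-3, 3, 7)
print(quick_gelu(x))
print(approx_gelu(x))
```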
2 changes: 1 addition & 1 deletion nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -397,7 +397,7 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict:
         if 'label' not in turn:
             turn[
                 'label'
-            ] = "quality:6,toxicity:0,humor:0,creativity:0,violence:0,helpfulness:6,not_appropriate:0"
+            ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4"
         value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value']
         conv.append_message(turn['from'], value)
         if not turn["value"]:
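The fallback label string now uses the newer attribute set (helpfulness, correctness, coherence, complexity, verbosity on a 0-4 scale) instead of the old quality/violence/not_appropriate set. A small illustrative helper (not part of the dataset code) for reading that string back into attribute scores:

```python
# Default label string taken from the hunk above.
DEFAULT_LABEL = (
    "quality:4,toxicity:0,humor:0,creativity:0,"
    "helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4"
)

def parse_label_string(label: str) -> dict:
    # "name:value,name:value,..." -> {"name": int(value), ...}
    pairs = (item.split(":") for item in label.split(","))
    return {name: int(value) for name, value in pairs}

print(parse_label_string(DEFAULT_LABEL))
# {'quality': 4, 'toxicity': 0, ..., 'verbosity': 4}
```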
@@ -612,8 +612,8 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None):
         output_tensor = self.model(**forward_args)
         return output_tensor

-    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
-        return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only)
+    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None):
+        return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step)

     def training_step(self, dataloader_iter, batch_idx):
         """
@@ -623,7 +623,7 @@ def training_step(self, dataloader_iter, batch_idx):
         """
         return MegatronGPTModel.training_step(self, dataloader_iter, batch_idx)

-    def get_forward_output_and_loss_func(self, validation_step=False):
+    def get_forward_output_and_loss_func(self, validation_step=False, tuning=False):
         def loss_func(output_tensor, loss_mask):
             loss_for_ub = self.loss_func(loss_mask, output_tensor)
             if validation_step and not self.cfg.data.get('validation_drop_last', True):
@@ -921,7 +921,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]:
         Returns:
             List of available pre-trained models.
         """
-        return []
+        return None

     def setup_test_data(self, cfg):
         pass
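These overrides are brought back in line with the current `MegatronGPTModel` signatures (`first_val_step` and `tuning`), and `list_available_models` now returns `None` to match its `Optional` return type. A minimal sketch, with stand-in classes whose names are purely illustrative, of why a stale override signature breaks as soon as a caller passes the new argument:

```python
# Stand-in classes to show why override signatures must track the parent API.
class Base:
    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None):
        return ("base", first_val_step)

class StaleOverride(Base):
    # Signature frozen at the old parent API.
    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
        return Base.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only)

class SyncedOverride(Base):
    def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None):
        return Base.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step)

caller_kwargs = dict(dataloader_iter=iter([]), batch_idx=0, forward_only=True, first_val_step=True)
print(SyncedOverride().fwd_bwd_step(**caller_kwargs))   # works: ('base', True)
try:
    StaleOverride().fwd_bwd_step(**caller_kwargs)        # TypeError: unexpected keyword argument
except TypeError as err:
    print(err)
```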
@@ -40,7 +40,9 @@


 class LatentDiffusionEdit(LatentDiffusion):
-    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+    def init_from_ckpt(
+        self, path, ignore_keys=list(), only_model=False, load_vae=True, load_unet=True, load_encoder=True,
+    ):
         pl_sd = torch.load(path, map_location="cpu")
         if "state_dict" in list(pl_sd.keys()):
             pl_sd = pl_sd["state_dict"]
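`init_from_ckpt` gains `load_vae`, `load_unet`, and `load_encoder` switches for loading only part of a checkpoint. A hedged sketch of the kind of prefix-based state-dict filtering such flags typically gate; the key prefixes below are assumptions for illustration, not taken from this diff:

```python
import torch

def filter_state_dict(state_dict: dict, load_vae: bool, load_unet: bool, load_encoder: bool) -> dict:
    # Hypothetical latent-diffusion key prefixes; the real checkpoint keys may differ.
    skip_prefixes = []
    if not load_vae:
        skip_prefixes.append("first_stage_model.")
    if not load_unet:
        skip_prefixes.append("model.diffusion_model.")
    if not load_encoder:
        skip_prefixes.append("cond_stage_model.")
    return {k: v for k, v in state_dict.items() if not any(k.startswith(p) for p in skip_prefixes)}

# Usage sketch: drop the UNet weights while keeping VAE and conditioning encoder.
sd = {"first_stage_model.w": torch.zeros(1), "model.diffusion_model.w": torch.zeros(1)}
print(filter_state_dict(sd, load_vae=True, load_unet=False, load_encoder=True).keys())
```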
@@ -557,9 +557,9 @@ def __init__(self, cfg, model_parallel_config):

         self.restarted_from_ckpt = False
         if ckpt_path is not None:
-            load_vae = True if cfg.load_vae is None else cfg.load_vae
-            load_unet = True if cfg.load_unet is None else cfg.load_unet
-            load_encoder = True if cfg.load_encoder is None else cfg.load_encoder
+            load_vae = True if cfg.get("load_vae", None) is None else cfg.load_vae
+            load_unet = True if cfg.get("load_unet", None) is None else cfg.load_unet
+            load_encoder = True if cfg.get("load_encoder", None) is None else cfg.load_encoder

             self.init_from_ckpt(
                 ckpt_path, ignore_keys, load_vae=load_vae, load_unet=load_unet, load_encoder=load_encoder,
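Using `cfg.get("load_vae", None)` keeps older configs that predate these keys working while still defaulting the flags to True. A tiny truth-table sketch with a plain-dict stand-in for the config (illustrative only, not the OmegaConf object itself):

```python
# Plain-dict stand-in for the config, just to show the defaulting rule used above.
def resolve_flag(cfg: dict, name: str) -> bool:
    # True when the key is absent or explicitly null, otherwise the configured value.
    return True if cfg.get(name, None) is None else cfg[name]

print(resolve_flag({}, "load_vae"))                   # True  (key missing -> default)
print(resolve_flag({"load_vae": None}, "load_vae"))   # True  (explicit null -> default)
print(resolve_flag({"load_vae": False}, "load_vae"))  # False (explicit value wins)
```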