@@ -29,6 +29,9 @@ def _is_disagg() -> bool:
     return os.getenv("TLLM_MULTIMODAL_DISAGGREGATED", "0") == "1"
 
 
+IMAGE_TOKEN_ID = 131072  # token id of the "<image>" placeholder
+
+
 class SquaredReLU(nn.Module):
 
     def forward(self, x):
@@ -77,25 +80,22 @@ def __init__(self,
         self.llm_hidden_size = config.llm_config.hidden_size
 
         self.mlp1 = nn.Sequential(
-            # nn.LayerNorm(self.vit_hidden_size *
-            #              int(1 / self.downsample_ratio)**2,
-            #              bias=False),
             RMSNorm(self.vit_hidden_size * int(1 / self.downsample_ratio)**2,
                     eps=1e-5),
             nn.Linear(self.vit_hidden_size * int(1 / self.downsample_ratio)**2,
                       self.vision_projection_hidden_size,
-                      bias=False),
-            SquaredReLU(),
+                      bias=False), SquaredReLU(),
             nn.Linear(self.vision_projection_hidden_size,
                       self.llm_hidden_size,
                       bias=False))
         self.mlp1 = self.mlp1.to(config.torch_dtype)
 
-        # self.img_context_token_id = None
         WITH_HF_CODES = False
         if WITH_HF_CODES:
             self.vision_model = transformers.AutoModel.from_config(
                 config.vision_config, trust_remote_code=True)
+            # Set the input_conditioner to an Identity module.
+            self.vision_model.radio_model.make_preprocessor_external()
             self.vision_model.to(config.torch_dtype)
 
             with open("hf_vision_encoder_arch.txt", "w") as f:
@@ -113,7 +113,6 @@ def __init__(self,
 
             with open("trtllm_vision_encoder_arch.txt", "w") as f:
                 f.write(str(self.vision_model))
-
         else:
             # Update the vision model with customized one.
             from .modeling_radio import RADIOModel
@@ -218,6 +217,7 @@ def __init__(self,
         self.img_context_token = "<image>"
         self.img_start_token = "<img>"
         self.img_end_token = "</img>"
+        self.dtype = model_config.torch_dtype
 
     @torch.inference_mode()
     def __call__(
@@ -258,7 +258,8 @@ def __call__(
 
         # Will package inputs for language model forward in AGGREGATE mode.
         multimodal_data = {}
-        multimodal_data['pixel_values'] = processed_images['pixel_values']
+        multimodal_data['pixel_values'] = processed_images['pixel_values'].to(
+            self.dtype)
         multimodal_data['num_patches'] = processed_images['num_patches']
         return input_ids[0].to(torch.int32).tolist(), {
             "multimodal_data": multimodal_data,
@@ -271,7 +272,7 @@ def __call__(
     model_type="NemotronH_Nano_VL_V2",
     placeholder_metadata=MultimodalPlaceholderMetadata(
         placeholder_map={
-            "image": "<image>",
+            "image": "<image>\n",
         },
         placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
         placeholders_separator="",
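
Note: with BEFORE_TEXT placement and an empty separator, each image now contributes "<image>\n" ahead of the user text. A toy sketch of the resulting prompt layout (the helper name is made up for this note):

def build_prompt(num_images: int, text: str) -> str:
    # One "<image>\n" placeholder per image, all placed before the text.
    return "<image>\n" * num_images + text

# build_prompt(2, "Describe both images.")
# -> '<image>\n<image>\nDescribe both images.'
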
@@ -321,38 +322,44 @@ def load_weights(self, weights):
             ) and m != "vision_model.radio_model.summary_idxs":
                 raise ValueError(f"Missing key: {m}")
         for u in unexpected_keys:
-            if not u.startswith('vision_model.radio_model.model.blocks.'):
+            if not u.startswith(
+                    'vision_model.radio_model.model.blocks.') and u not in [
+                        "vision_model.radio_model.input_conditioner.norm_mean",
+                        "vision_model.radio_model.input_conditioner.norm_std",
+                    ]:
                 raise ValueError(f"Unexpected key: {u}")
 
-        # Load weights for vision transformer module.
-        model_weights = {
-            k.replace('vision_model.radio_model.model.', ''): v
-            for k, v in weights.items()
-            if k.startswith('vision_model.radio_model.model.')
-        }
-        converted_weights = dict()
-        for name in model_weights:
-            # Handle with weights and bias for vision transformer's qkv projection.
-            if "attn.qkv." in name:
-                q_name = name.replace("attn.qkv.", "attn.q_proj.")
-                k_name = name.replace("attn.qkv.", "attn.k_proj.")
-                v_name = name.replace("attn.qkv.", "attn.v_proj.")
-                dim_shape = model_weights[name].shape[0] // 3
-                converted_weights[q_name] = model_weights[name][:dim_shape]
-                converted_weights[k_name] = model_weights[name][dim_shape:2 *
-                                                                dim_shape]
-                converted_weights[v_name] = model_weights[name][2 * dim_shape:]
-            else:
-                converted_weights[name] = model_weights[name]
-        pattern_mapping = {
-            r'(.*?)attn.proj.(.*)': r'\1attn.o_proj.\2',
-            r'(.*?)mlp.fc1.(.*)': r'\1mlp.up_proj.\2',
-            r'(.*?)mlp.fc2.(.*)': r'\1mlp.down_proj.\2',
-        }
-        modeling_utils._load_weights_impl(
-            self.vision_encoder.vision_model.radio_model.model,
-            converted_weights,
-            params_map=pattern_mapping)
+        if len(unexpected_keys) > 0 or len(missing_keys) > 1:  # checkpoint needs conversion
+            # Load weights for vision transformer module.
+            model_weights = {
+                k.replace('vision_model.radio_model.model.', ''): v
+                for k, v in weights.items()
+                if k.startswith('vision_model.radio_model.model.')
+            }
+            converted_weights = dict()
+            for name in model_weights:
+                # Split the fused weight/bias of the vision transformer's qkv projection.
+                if "attn.qkv." in name:
+                    q_name = name.replace("attn.qkv.", "attn.q_proj.")
+                    k_name = name.replace("attn.qkv.", "attn.k_proj.")
+                    v_name = name.replace("attn.qkv.", "attn.v_proj.")
+                    dim_shape = model_weights[name].shape[0] // 3
+                    converted_weights[q_name] = model_weights[name][:dim_shape]
+                    converted_weights[k_name] = model_weights[name][
+                        dim_shape:2 * dim_shape]
+                    converted_weights[v_name] = model_weights[name][2 *
+                                                                    dim_shape:]
+                else:
+                    converted_weights[name] = model_weights[name]
+            pattern_mapping = {
+                r'(.*?)attn.proj.(.*)': r'\1attn.o_proj.\2',
+                r'(.*?)mlp.fc1.(.*)': r'\1mlp.up_proj.\2',
+                r'(.*?)mlp.fc2.(.*)': r'\1mlp.down_proj.\2',
+            }
+            modeling_utils._load_weights_impl(
+                self.vision_encoder.vision_model.radio_model.model,
+                converted_weights,
+                params_map=pattern_mapping)
 
         # Load language model weights.
         filtered_weights = {
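
Note: the guarded block above splits each fused attn.qkv tensor into separate q/k/v projections along dim 0 before remapping names. A self-contained sketch of that conversion (toy shapes, illustrative helper name):

import torch

def split_fused_qkv(name: str, tensor: torch.Tensor) -> dict:
    # Fused checkpoints stack q, k and v along dim 0, so each part is a third.
    if "attn.qkv." not in name:
        return {name: tensor}
    dim = tensor.shape[0] // 3
    return {
        name.replace("attn.qkv.", "attn.q_proj."): tensor[:dim],
        name.replace("attn.qkv.", "attn.k_proj."): tensor[dim:2 * dim],
        name.replace("attn.qkv.", "attn.v_proj."): tensor[2 * dim:],
    }

# A [3 * 4, 4] fused weight yields three [4, 4] projections.
parts = split_fused_qkv("blocks.0.attn.qkv.weight", torch.randn(12, 4))
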
@@ -405,11 +412,8 @@ def forward(
             self.llm.model.embed_tokens,
             input_ids,
             mm_embedding,
-            mm_token_ids=torch.tensor([
-                131072
-            ], dtype=torch.int32),  # 131072 is the token id for the image token
+            mm_token_ids=torch.tensor([IMAGE_TOKEN_ID], dtype=torch.int32),
         )
-
         output_prob = self.llm.forward(
             attn_metadata=attn_metadata,
             input_ids=input_ids,
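
Note: mm_token_ids tells the embedding fusion which positions carry image tokens. A minimal sketch of the idea, not the actual fuse_input_embeds implementation (helper name and shapes are illustrative):

import torch

IMAGE_TOKEN_ID = 131072

def fuse_embeds_sketch(input_ids: torch.Tensor, text_embeds: torch.Tensor,
                       mm_embeds: torch.Tensor) -> torch.Tensor:
    # Replace the text embedding at every <image> slot with a vision embedding.
    mask = input_ids == IMAGE_TOKEN_ID
    assert int(mask.sum()) == mm_embeds.shape[0], "one vision row per <image> slot"
    fused = text_embeds.clone()
    fused[mask] = mm_embeds.to(fused.dtype)
    return fused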