
Commit 1f10d0c

[TRTLLM-6577][feat] Support nano_v2_vlm in pytorch backend
* Clean up code.
* Add test_e2e for nano_v2 VLM.

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent d1058ef commit 1f10d0c

File tree

3 files changed (+83, -73 lines)

tensorrt_llm/_torch/models/modeling_nanov2vlm.py

Lines changed: 5 additions & 0 deletions
@@ -279,6 +279,11 @@ def __call__(
                 Image.fromarray((image.permute(1, 2, 0) * 255).to(
                     torch.uint8).cpu().numpy()) for image in images
             ]
+        else:
+            input_ids = self.tokenizer.encode(text_prompt,
+                                              add_special_tokens=False,
+                                              return_tensors="pt")
+            return input_ids[0].to(torch.int32).tolist(), {}
 
         # Processing for multimodal data.
         processed_images = self.processor(images=images,
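
The new `else` branch gives the input processor a text-only path: when no images are attached, it bypasses the multimodal processor and tokenizes the prompt directly. A minimal sketch of the same pattern with a generic Hugging Face tokenizer (the "gpt2" checkpoint is only a stand-in, not the model's actual tokenizer):

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
# encode(..., return_tensors="pt") yields a [1, seq_len] tensor; row 0 is
# cast to int32 and flattened into the plain token-id list the backend
# expects, paired with an empty dict since there is no multimodal data.
input_ids = tokenizer.encode("a text-only prompt",
                             add_special_tokens=False,
                             return_tensors="pt")
token_list, extra = input_ids[0].to(torch.int32).tolist(), {}
```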

tensorrt_llm/_torch/models/modeling_radio.py

Lines changed: 56 additions & 73 deletions
@@ -22,8 +22,13 @@
 
 input_dim_t = Union[int, Tuple[int, int]]
 
-# Need for model weight loading.
-NUM_ATTENTION_HEADS = 16
+# Model parameters that are not in config.json.
+# TODO: read from config.json when it is released.
+NUM_ATTENTION_HEADS_FOR_VIT = 16
+IMAGE_SIZE_FOR_VIT = 224
+PATCH_SIZE_FOR_VIT = 16
+EMBED_DIM_FOR_VIT = 1280
+DEPTH_FOR_VIT = 32
 
 
 class Resolution(NamedTuple):
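
These constants pin down the ViT geometry until a config.json ships; the derived quantities follow directly from them (a quick sanity check, not part of the diff):

```python
NUM_ATTENTION_HEADS_FOR_VIT = 16
IMAGE_SIZE_FOR_VIT = 224
PATCH_SIZE_FOR_VIT = 16
EMBED_DIM_FOR_VIT = 1280

# 224 / 16 = 14 patches per side -> 196 patch tokens per image.
num_patches = (IMAGE_SIZE_FOR_VIT // PATCH_SIZE_FOR_VIT) ** 2
# 1280 channels split over 16 heads -> 80 channels per attention head.
head_dim = EMBED_DIM_FOR_VIT // NUM_ATTENTION_HEADS_FOR_VIT
assert (num_patches, head_dim) == (196, 80)
```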
@@ -34,7 +39,7 @@ class Resolution(NamedTuple):
 class RADIOConfig(PretrainedConfig):
     """Pretrained Hugging Face configuration for RADIO models.
 
-    Copy from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/hf_model.py.
+    Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/hf_model.py.
     """
 
     def __init__(
@@ -55,8 +60,7 @@ def __init__(
         for field in ["dtype", "amp_dtype"]:
             if self.args is not None and field in self.args:
                 # Convert to a string in order to make it serializable.
-                # For example for torch.float32 we will store "float32",
-                # for "bfloat16" we will store "bfloat16".
+                # For example for torch.float32 we will store "float32".
                 self.args[field] = str(args[field]).split(".")[-1]
         self.version = version
         self.patch_size = patch_size
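
The shortened comment still relies on one trick worth spelling out: `str(...).split(".")[-1]` normalizes both torch dtypes and plain strings, since only the former contain a dot:

```python
import torch

assert str(torch.float32) == "torch.float32"
assert str(torch.float32).split(".")[-1] == "float32"  # dtype object
assert str("bfloat16").split(".")[-1] == "bfloat16"    # already a string
```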
@@ -68,13 +72,13 @@ def __init__(
         self.vitdet_window_size = vitdet_window_size
         self.feature_normalizer_config = feature_normalizer_config
         self.inter_feature_normalizer_config = inter_feature_normalizer_config
-        self.num_key_value_heads = NUM_ATTENTION_HEADS
-        self.num_attention_heads = NUM_ATTENTION_HEADS
+        self.num_key_value_heads = NUM_ATTENTION_HEADS_FOR_VIT
+        self.num_attention_heads = NUM_ATTENTION_HEADS_FOR_VIT
         super().__init__(**kwargs)
 
 
 class ClsToken(nn.Module):
-    """Copy from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/cls_token.py."""
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/cls_token.py."""
 
     def __init__(
         self,
@@ -115,7 +119,7 @@ def forward(self, x: torch.Tensor):
 
 
 class ViTPatchGenerator(nn.Module):
-    """Copy from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
 
     def __init__(
         self,
@@ -132,8 +136,6 @@ def __init__(
         register_multiple: Optional[int] = None,
         num_registers: Optional[int] = None,
         patch_bias: bool = False,
-        device=None,
-        dtype=None,
     ):
         super().__init__()
 
@@ -151,42 +153,31 @@ def __init__(
         self.cpe_mode = max_input_dims != input_dims
         self.pos_dropout = pos_dropout
         self.return_pos_enc = return_pos_enc
-
-        factory = dict(device=device, dtype=dtype)
-
         self.patch_size = patch_size
         self.abs_pos = abs_pos
         self.embed_dim = embed_dim
-
         self.num_rows = max_input_dims[0] // patch_size
         self.num_cols = max_input_dims[1] // patch_size
         self.input_dims = tuple(d // patch_size for d in input_dims)
         self.num_patches = self.num_rows * self.num_cols
         self.max_input_dims = max_input_dims
 
         self.im_to_patches = Im2Patches(patch_size)
-        self.embedder = ViTPatchLinear(patch_size,
-                                       embed_dim,
-                                       bias=patch_bias,
-                                       **factory)
-
+        self.embedder = ViTPatchLinear(patch_size, embed_dim, bias=patch_bias)
         if abs_pos:
             scale = embed_dim**-0.5
             self.pos_embed = nn.Parameter(
-                torch.randn(1, self.num_patches, embed_dim, **factory) * scale)
-
+                torch.randn(1, self.num_patches, embed_dim) * scale)
         self.cls_token = ClsToken(
             embed_dim,
             num_tokens=num_cls_tokens,
             enabled=cls_token,
             register_multiple=register_multiple,
             num_registers=num_registers,
         )
-
         self.patch_normalizer = nn.LayerNorm(
             embed_dim) if normalize_patches else nn.Identity()
 
-    @torch.compile
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         patches = self.embed_patches(x)
         patches, pos_enc = self.apply_pos_enc(patches, input_size=x.shape[2:])
@@ -265,7 +256,6 @@ def window_select(pos_embed):
                 size=(max_dim, max_dim),
                 align_corners=True,
                 mode='bilinear').to(pos_embed.dtype)
-
             pos_embed = window_select(pos_embed)
         else:
             pos_embed = window_select(pos_embed)
@@ -277,12 +267,11 @@ def window_select(pos_embed):
                 mode='bilinear').to(pos_embed.dtype)
 
         pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
-
         return pos_embed
 
 
 class Im2Patches(nn.Module):
-    """Copy from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
 
     def __init__(self, patch_size: int):
         super().__init__()
@@ -308,22 +297,23 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class ViTPatchLinear(nn.Linear):
-    """Copy from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
-
-    def __init__(self,
-                 patch_size: int,
-                 embed_dim: int,
-                 bias: bool = False,
-                 **kwargs):
-        super().__init__(3 * (patch_size**2), embed_dim, bias=bias, **kwargs)
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/vit_patch_generator.py."""
+
+    def __init__(
+        self,
+        patch_size: int,
+        embed_dim: int,
+        bias: bool = False,
+    ):
+        super().__init__(3 * (patch_size**2), embed_dim, bias=bias)
         self.patch_size = patch_size
 
 
 class Block(nn.Module):
     """Transformer block with pre-normalization.
 
-    Copy from https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
-    and use trtllm_attn and trtllm_mlp to replace attn and mlp.
+    Modified from https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py.
+    Uses trtllm_attn and trtllm_mlp in place of the original attention and MLP layers.
     """
 
     def __init__(
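
`Im2Patches` plus `ViTPatchLinear` implement the standard non-overlapping patch embedding; the `3 * (patch_size**2)` input width comes from flattening each RGB patch. A self-contained sketch of the equivalent computation (shapes assumed, using `unfold` rather than the repo's implementation):

```python
import torch
import torch.nn.functional as F

patch_size, embed_dim = 16, 1280
x = torch.randn(2, 3, 224, 224)                       # [B, C, H, W]
patches = F.unfold(x, kernel_size=patch_size,
                   stride=patch_size)                 # [2, 3*16*16=768, 196]
patches = patches.transpose(1, 2)                     # [2, 196, 768]
embedder = torch.nn.Linear(3 * patch_size**2, embed_dim, bias=False)
tokens = embedder(patches)                            # [2, 196, 1280]
```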
@@ -378,16 +368,16 @@ def __init__(
             hidden_size=dim,
             num_attention_heads=num_heads,
             num_key_value_heads=num_heads,
+            max_position_embeddings=None,
             bias=qkv_bias,
-            dense_bias=proj_bias,
-            dtype=self.model_config.torch_dtype,
-            layer_idx=layer_idx,
             pos_embd_params=None,
             rope_fusion=None,
+            layer_idx=layer_idx,
+            dtype=self.model_config.torch_dtype,
+            dense_bias=proj_bias,
+            config=self.model_config,
             q_scaling=1.0,
             attention_chunk_size=None,
-            config=self.model_config,
-            max_position_embeddings=None,
         )
         if init_values:
             raise IOError(
@@ -399,8 +389,6 @@ def __init__(
                 "Limited RADIO model support: Block does not support DropPath for now."
             )
         self.drop_path1 = nn.Identity()
-
-        self.norm2 = norm_layer(dim)
         if scale_mlp_norm:
             raise IOError(
                 "Limited RADIO model support: Block does not support scale_mlp_norm for now."
@@ -409,6 +397,7 @@ def __init__(
             raise IOError(
                 "Limited RADIO model support: Block does not support proj_drop for now."
             )
+        self.norm2 = norm_layer(dim)
 
         self.mlp = trtllm_mlp.MLP(
             hidden_size=dim,
@@ -442,8 +431,7 @@ def forward(
             position_ids=None,
             hidden_states=x,
             attn_metadata=attn_metadata,
-            attention_mask=attention_interface.PredefinedAttentionMask.
-            FULL  # Always FULL for Vision
+            attention_mask=attention_interface.PredefinedAttentionMask.FULL,
         )
         x = self.ls1(x)
         x = self.drop_path1(x)
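
Collapsing the wrapped attribute access also keeps the intent explicit: vision tokens attend bidirectionally, so the block always requests the FULL predefined mask rather than the causal mask a decoder would use. The distinction, sketched directly in torch:

```python
import torch

seq_len = 4
full = torch.ones(seq_len, seq_len, dtype=torch.bool)  # every token sees all
causal = torch.tril(full)                              # decoder-style masking
```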
@@ -461,7 +449,7 @@ def forward(
 class VisionTransformer(nn.Module):
     """ Vision Transformer.
 
-    Copy from https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py.
+    Modified from https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py.
     """
 
     def __init__(
@@ -535,9 +523,11 @@ def __init__(
             **kwargs: Additional keyword arguments, to store unused arguments.
         """
         super().__init__()
-        assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map')
-        assert class_token or global_pool != 'token'
-        assert pos_embed in ('', 'none', 'learn')
+        if not (class_token or global_pool != 'token'):
+            raise ValueError(
+                "Class token must be used with global_pool == 'token'")
+        if pos_embed not in ('', 'none', 'learn'):
+            raise ValueError(f"Invalid pos_embed: {pos_embed}")
         use_fc_norm = global_pool in ('avg', 'avgmax',
                                       'max') if fc_norm is None else fc_norm
 
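Swapping `assert` for explicit raises matters beyond style: `python -O` strips assert statements, while `ValueError` checks always run. (Note the old `global_pool` membership assert is dropped without a replacement.) The pattern, in isolation:

```python
def check_pos_embed(pos_embed: str) -> None:
    # Survives `python -O`, unlike an assert.
    if pos_embed not in ('', 'none', 'learn'):
        raise ValueError(f"Invalid pos_embed: {pos_embed}")

check_pos_embed('learn')     # passes silently
# check_pos_embed('sincos')  # would raise ValueError
```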
@@ -555,7 +545,7 @@ def __init__(
 
         self.num_classes = num_classes
         self.global_pool = global_pool
-        self.num_features = self.head_hidden_size = self.embed_dim = embed_dim  # for consistency with other models
+        self.num_features = self.head_hidden_size = self.embed_dim = embed_dim
         self.num_prefix_tokens = 1 if class_token else 0
         self.num_prefix_tokens += reg_tokens
         self.num_reg_tokens = reg_tokens
@@ -565,7 +555,7 @@
         self.patch_drop = nn.Identity()
         self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
 
-        # stochastic depth decay rule
+        # Stochastic depth decay rule.
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
         self.blocks = nn.ModuleList([
             Block(
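
The reworded comment describes timm's usual rule: each of the `depth` blocks gets a linearly increasing drop-path rate, so deeper blocks are skipped more often during training. Concretely:

```python
import torch

drop_path_rate, depth = 0.1, 5
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
# dpr == [0.0, 0.025, 0.05, 0.075, 0.1]; block i uses rate dpr[i]
```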
@@ -590,7 +580,7 @@ def __init__(
         self.norm = norm_layer(
             embed_dim) if final_norm and not use_fc_norm else nn.Identity()
 
-        # Classifier Head but not used for RADIO embedding models.
+        # Initialize the classifier head; it is not used for RADIO embedding models.
         self.attn_pool = None
         self.fc_norm = norm_layer(
             embed_dim) if final_norm and use_fc_norm else nn.Identity()
@@ -664,9 +654,8 @@ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
         if self.model_config is not None:
             seq_lengths = [seq_len] * batch_size
             attn_metadata = self.prepare_attn_metadata(batch_size, seq_lengths)
-            x = x.reshape(
-                batch_size * seq_len,
-                hidden_size)  # Need flatten batch/seq_len for trtllm attention.
+            # Need to flatten batch/seq_len for trtllm attention.
+            x = x.reshape(batch_size * seq_len, hidden_size)
         else:
             attn_metadata = None
         for block in self.blocks:
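
Moving the comment above the call makes the shape contract easier to see: trtllm attention consumes packed tokens, so `[batch, seq, hidden]` activations are flattened and the per-sequence lengths are carried in the metadata. A shape-only sketch (sizes assumed):

```python
import torch

batch_size, seq_len, hidden_size = 2, 196, 1280
x = torch.randn(batch_size, seq_len, hidden_size)
seq_lengths = [seq_len] * batch_size              # consumed via attn_metadata
x = x.reshape(batch_size * seq_len, hidden_size)  # [392, 1280] packed tokens
```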
@@ -678,7 +667,7 @@ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class RADIOVisionModelBase(nn.Module):
-    """Copy and modify from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/radio_model.py"""
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/radio_model.py."""
 
     def __init__(
         self,
@@ -783,17 +772,13 @@ def get_nearest_supported_resolution(self, height: int,
             round(height / self.min_resolution_step) * self.min_resolution_step)
         width = int(
             round(width / self.min_resolution_step) * self.min_resolution_step)
-
         height = max(height, self.min_resolution_step)
         width = max(width, self.min_resolution_step)
-
         return Resolution(height=height, width=width)
 
-    def forward(
-        self,
-        x: torch.Tensor,
-        feature_fmt: str = 'NLC'
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    def forward(self,
+                x: torch.Tensor,
+                feature_fmt: str = 'NLC') -> torch.Tensor:
         res_step = self.min_resolution_step
         if res_step is not None and (x.shape[-2] % res_step != 0
                                      or x.shape[-1] % res_step != 0):
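
The blank-line trims leave the rounding logic intact: each side is rounded to the nearest multiple of `min_resolution_step`, floored at one step. Standalone, with an assumed step of 16:

```python
MIN_RESOLUTION_STEP = 16  # assumed value for illustration

def nearest_supported(side: int) -> int:
    side = int(round(side / MIN_RESOLUTION_STEP) * MIN_RESOLUTION_STEP)
    return max(side, MIN_RESOLUTION_STEP)

assert nearest_supported(250) == 256  # rounds to the nearest multiple
assert nearest_supported(7) == 16     # floored at one step
```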
@@ -807,7 +792,6 @@ def forward(
         ret = self._extract_final(x, y, feature_fmt=feature_fmt)
         return ret
 
-    @torch.compile
     def _extract_final(self,
                        x: torch.Tensor,
                        y: torch.Tensor,
@@ -836,12 +820,11 @@ def _extract_final(self,
             raise ValueError(
                 f'Unsupported feature_fmt: {feature_fmt}. Must be one of ["NLC", "NCHW"]'
             )
-
         return fmt_feat
 
 
 class RADIOVisionModel(PreTrainedModel):
-    """Copy and modify from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/hf_model.py."""
+    """Modified from https://huggingface.co/nvidia/C-RADIOv2-H/blob/main/hf_model.py."""
 
     def __init__(self, model_config: model_config_lib.ModelConfig):
         """
@@ -863,11 +846,11 @@ def __init__(self, model_config: model_config_lib.ModelConfig):
         elif args.input_size is not None:
             in_chans = args.input_size[0]
         vit_model = VisionTransformer(
-            img_size=224,
-            patch_size=16,
-            embed_dim=1280,
-            depth=32,
-            num_heads=NUM_ATTENTION_HEADS,
+            img_size=IMAGE_SIZE_FOR_VIT,
+            patch_size=PATCH_SIZE_FOR_VIT,
+            embed_dim=EMBED_DIM_FOR_VIT,
+            depth=DEPTH_FOR_VIT,
+            num_heads=NUM_ATTENTION_HEADS_FOR_VIT,
             in_chans=in_chans,
             drop_rate=args.drop,
             special_args=args,
@@ -920,11 +903,11 @@ def load_weights(self, weights):
         }
         missing_keys, unexpected_keys = self.radio_model.load_state_dict(
             filter_weights, strict=False)
-
         # Check missing and unexpected keys.
         # The input conditioner is not initialized in current implementation.
         unexpected_keys.remove("input_conditioner.norm_mean")
         unexpected_keys.remove("input_conditioner.norm_std")
+        # Partial model.blocks weights will be loaded in a following step.
         for m in missing_keys:
             if not m.startswith('model.blocks.'):
                 raise ValueError(f"Missing key: {m}")
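
The `load_weights` pattern is worth naming: load with `strict=False`, then audit the reported keys so only known-benign mismatches pass and anything else still fails loudly. A generic sketch of the same idea (the prefix and conditioner keys mirror the code above; the `load_checked` helper itself is hypothetical):

```python
import torch.nn as nn

def load_checked(model: nn.Module, state_dict: dict) -> None:
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    # Keys known to be absent on purpose (the input conditioner is not built).
    for known in ("input_conditioner.norm_mean", "input_conditioner.norm_std"):
        if known in unexpected:
            unexpected.remove(known)
    # Remaining model.blocks.* weights are loaded in a later step.
    for key in missing:
        if not key.startswith("model.blocks."):
            raise ValueError(f"Missing key: {key}")
    if unexpected:
        raise ValueError(f"Unexpected keys: {unexpected}")
```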
