 from transformers import PretrainedConfig, PreTrainedModel

 from tensorrt_llm._torch import model_config as model_config_lib
+from tensorrt_llm._torch.attention_backend import AttentionMetadata
 from tensorrt_llm._torch.attention_backend import \
     interface as attention_interface
 from tensorrt_llm._torch.attention_backend import utils as attention_utils
@@ -540,9 +541,8 @@ def __init__(
         act_layer = nn.GELU

         self.model_config = model_config
-        if self.model_config is not None:
-            self.config = model_config.pretrained_config
-            self.config.num_key_value_heads = num_heads
+        self.config = model_config.pretrained_config
+        self.config.num_key_value_heads = num_heads

         self.num_classes = num_classes
         self.global_pool = global_pool
@@ -622,28 +622,31 @@ def __init__(
         self.patch_size = patch_size
         self.num_cls_tokens = num_cls_tokens
         self.num_registers = self.patch_generator.num_registers
-        if self.model_config is not None:
-            self.metadata_cls = attention_utils.get_attention_backend(
-                model_config.attn_backend).Metadata
-        else:
-            self.metadata_cls = None

-    def prepare_attn_metadata(self, batch_size: int, seq_lengths: List[int]):
+        self.metadata_cls = attention_utils.get_attention_backend(
+            model_config.attn_backend).Metadata
+        self.attn_metadata = self.metadata_cls(
+            max_num_requests=8192,  # TODO: Make this dynamic
+            max_num_tokens=model_config.max_num_tokens,
+            kv_cache_manager=None,
+        )
+
+    def prepare_attn_metadata(self, batch_size: int, seq_lengths: List[int],
+                              attn_metadata: AttentionMetadata):
         """
         To simplify the usage of the model, this function aims to fill the metadata for Attention
         Call this function before forward pass
         """
+        prompt_lens = seq_lengths
+        seq_lens = torch.tensor(seq_lengths, dtype=torch.int, pin_memory=True)
         request_ids = list(range(1, batch_size + 1))
-        attn_metadata = self.metadata_cls(
-            seq_lens=torch.tensor(seq_lengths, dtype=torch.int),
-            num_contexts=batch_size,
-            max_num_requests=batch_size,
-            max_num_tokens=sum(seq_lengths),
-            kv_cache_manager=None,
-            request_ids=request_ids,
-            prompt_lens=seq_lengths,
-        )
-        attn_metadata.max_seq_len = max(seq_lengths)
+
+        attn_metadata.seq_lens = seq_lens
+        attn_metadata.num_contexts = batch_size
+        attn_metadata.request_ids = request_ids
+        attn_metadata.prompt_lens = prompt_lens
+        attn_metadata.max_seq_len = seq_lens.max().item()
+
         attn_metadata.prepare()
         return attn_metadata

@@ -652,13 +655,11 @@ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
         x = self.patch_generator(x)

         batch_size, seq_len, hidden_size = x.shape
-        if self.model_config is not None:
-            seq_lengths = [seq_len] * batch_size
-            attn_metadata = self.prepare_attn_metadata(batch_size, seq_lengths)
-            # Need flatten batch/seq_len for trtllm attention.
-            x = x.reshape(batch_size * seq_len, hidden_size)
-        else:
-            attn_metadata = None
+        seq_lengths = [seq_len] * batch_size
+        attn_metadata = self.prepare_attn_metadata(batch_size, seq_lengths,
+                                                   self.attn_metadata)
+        # Need flatten batch/seq_len for trtllm attention.
+        x = x.reshape(batch_size * seq_len, hidden_size)
         for block in self.blocks:
             x = block(x, attn_metadata=attn_metadata)
         x = x.reshape(batch_size, seq_len, hidden_size)
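For readers following the new calling convention: the attention metadata is now allocated once in `__init__`, and only its per-batch fields are refilled before each forward pass. Below is a minimal usage sketch of that flow; it is not part of the diff, the `vit` instance and the input shape are illustrative assumptions, and only `prepare_attn_metadata`, `attn_metadata`, and `blocks` are taken from the code above.

```python
import torch

# `vit` is a hypothetical instance of the vision transformer patched above,
# built with a TRT-LLM model_config; the shapes here are placeholders.
batch_size, seq_len, hidden_size = 2, 1024, 1280
x = torch.randn(batch_size, seq_len, hidden_size)

# Refill the per-batch fields of the metadata object created in __init__.
seq_lengths = [seq_len] * batch_size
attn_metadata = vit.prepare_attn_metadata(batch_size, seq_lengths,
                                          vit.attn_metadata)

# The TRT-LLM attention backend consumes a flattened (batch * seq_len, hidden) layout.
x = x.reshape(batch_size * seq_len, hidden_size)
for block in vit.blocks:
    x = block(x, attn_metadata=attn_metadata)
x = x.reshape(batch_size, seq_len, hidden_size)
```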