3 files changed, +16 -5 lines changed

@@ -36,9 +36,11 @@
                     reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("full_graph", [False])
 def test_models(
     model: str,
     max_tokens: int,
+    full_graph: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     with monkeypatch.context() as m:
@@ -54,7 +56,15 @@ def test_models(
                                          temperature=0.0)
         # TODO: change to use vllmrunner when the registry of custom op is solved
         # while running pytest
-        vllm_model = LLM(model)
+        if full_graph:
+            vllm_model = LLM(model,
+                             compilation_config={
+                                 "full_cuda_graph": True,
+                                 "cudagraph_capture_sizes":
+                                 [1, 4, 16, 64, 256]
+                             })
+        else:
+            vllm_model = LLM(model)
         vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
         del vllm_model
         torch.npu.empty_cache()
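
For context, a minimal standalone sketch of what the new full_graph=True branch exercises, outside pytest. The prompt and the model name are placeholders, not entries from the test's MODELS list; the compilation_config keys come straight from the diff above.

    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

    # full_cuda_graph captures the whole forward pass as one graph instead of
    # piecewise capture; cudagraph_capture_sizes lists the batch sizes that get
    # captured, with other batch sizes falling back to non-graph execution.
    llm = LLM("some-model",  # placeholder model name
              compilation_config={
                  "full_cuda_graph": True,
                  "cudagraph_capture_sizes": [1, 4, 16, 64, 256],
              })
    outputs = llm.generate(prompts, sampling_params)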
@@ -352,9 +352,10 @@ def forward(
         if self.full_graph:
             graph_params = get_graph_params()
             q = query.view(num_tokens, -1, self.hidden_size)
-            k = self.key_cache.view(-1, self.block_size,
-                                    self.num_kv_heads * self.head_size)
-            v = self.value_cache.view(
+            k = self.key_cache.view(  # type: ignore
+                -1, self.block_size,
+                self.num_kv_heads * self.head_size)
+            v = self.value_cache.view(  # type: ignore
                 -1, self.block_size,
                 self.num_kv_heads * self.head_size)
             actual_seq_lens = attn_metadata.seq_lens_list
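
The added # type: ignore comments suggest key_cache and value_cache are declared Optional (None until the KV cache is bound), so mypy cannot prove .view() is safe even though the caches are set by the time forward() runs; that reading is an inference, not stated in the diff. A sketch of the reshape itself, with invented shapes:

    import torch

    # Invented sizes for illustration only.
    num_blocks, block_size, num_kv_heads, head_size = 8, 128, 4, 64
    key_cache = torch.zeros(num_blocks, block_size, num_kv_heads, head_size)

    # Collapse the head dimensions so each slot in a block is one flat KV row:
    # (num_blocks, block_size, num_kv_heads, head_size)
    #   -> (num_blocks, block_size, num_kv_heads * head_size)
    k = key_cache.view(-1, block_size, num_kv_heads * head_size)
    assert k.shape == (num_blocks, block_size, num_kv_heads * head_size)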
@@ -11,7 +11,7 @@ class AscendCommonAttentionMetadata:
     cache groups and thus having different block table.
     """
 
-    query_start_loc: Optional[torch.Tensor] = None
+    query_start_loc: torch.Tensor = None
    """(batch_size + 1,), the start location of each request in query Tensor"""
     seq_lens: Optional[torch.Tensor] = None
     """(batch_size,), the length of each request including both computed tokens