
Commit f38a389

some renames
Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent 775d67f commit f38a389

2 files changed: 24 additions, 25 deletions


vllm_ascend/compilation/acl_graph.py

Lines changed: 14 additions & 14 deletions
@@ -26,7 +26,7 @@ class ACLGraphEntry:
     aclgraph: Optional[torch.npu.NPUGraph] = None
     output: Optional[Any] = None
 
-    # for cudagraph debugging, track the input addresses
+    # for aclgraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
@@ -35,16 +35,16 @@ class ACLGraphWrapper:
     """Wraps a runnable to add acl graph capturing and replaying ability. And
     provide attribute access to the underlying `runnable` via `__getattr__`.
 
-    The workflow of this wrapper in the cudagraph dispatching is as follows:
+    The workflow of this wrapper in the aclgraph dispatching is as follows:
     1. At initialization, a runtime mode is assigned to the wrapper (FULL or
     PIECEWISE).
     2. At runtime, the wrapper receives a runtime_mode and a
     batch_descriptor(key) from the forward context and blindly trust them
-    for cudagraph dispatching.
+    for aclgraph dispatching.
     3. If runtime_mode is NONE or runtime_mode does not match the mode of the
     wrapper, just call the runnable directly.
     4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
-    the wrapper will perform cudagraph capture(if key does not exist, create
+    the wrapper will perform aclgraph capture(if key does not exist, create
     a new entry and cache it) or replay (if key exists in the cache).
 
     Note: ACLGraphWrapper does not store persistent buffers or copy any
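
The dispatch rules spelled out in this docstring are the core of the wrapper. As a quick illustration only (a minimal, self-contained sketch; `Mode`, `dispatch`, and the plain dict cache are stand-ins, not the vllm-ascend API), the decision flow looks roughly like this:

```python
from enum import Enum
from typing import Any, Callable


class Mode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2


def dispatch(wrapper_mode: Mode, runtime_mode: Mode, key: Any,
             cache: dict, runnable: Callable, *args) -> Any:
    # Rule 3: NONE or a mismatched mode bypasses graph logic entirely.
    if runtime_mode is Mode.NONE or runtime_mode is not wrapper_mode:
        return runnable(*args)
    # Rule 4: matching mode -> "capture" on first sight of this key
    # (here simply running once and caching the output), "replay" afterwards.
    if key not in cache:
        cache[key] = runnable(*args)
    return cache[key]


cache: dict = {}
print(dispatch(Mode.PIECEWISE, Mode.NONE, 8, cache, lambda x: 2 * x, 3))       # bypassed
print(dispatch(Mode.PIECEWISE, Mode.PIECEWISE, 8, cache, lambda x: 2 * x, 3))  # "capture"
print(dispatch(Mode.PIECEWISE, Mode.PIECEWISE, 8, cache, lambda x: 2 * x, 3))  # "replay"
```
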
@@ -71,7 +71,7 @@ def __init__(self,
         self.first_run_finished = False
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
 
-        # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't
+        # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't
         # need to initialize a ACLGraphWrapper.
         assert self.runtime_mode != CUDAGraphMode.NONE
         if self.graph_pool is None:
@@ -81,7 +81,7 @@ def __init__(self,
             cudagraph_options = CUDAGraphOptions()
         self.aclgraph_options = cudagraph_options
         # the entries for different batch descriptors that we need to capture
-        # cudagraphs for.
+        # aclgraphs for.
         self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry]\
             = {}
 
@@ -90,7 +90,7 @@ def __getattr__(self, key: str):
         if hasattr(self.runnable, key):
            return getattr(self.runnable, key)
         raise AttributeError(f"Attribute {key} not exists in the runnable of "
-                             f"cudagraph wrapper: {self.runnable}")
+                             f"aclgraph wrapper: {self.runnable}")
 
     def unwrap(self) -> Callable:
         # in case we need to access the original runnable.
@@ -104,7 +104,7 @@ def __call__(self, *args, **kwargs):
         if aclgraph_runtime_mode == CUDAGraphMode.NONE or \
             aclgraph_runtime_mode != self.runtime_mode:
             # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
-            # running without cudagraphs.
+            # running without aclgraphs.
             # We do not trigger capture/replay if the runtime mode is not
             # matches. This enables properly dispatching to the correct
             # CUDAGraphWrapper when nesting multiple instances with different
@@ -120,13 +120,13 @@ def __call__(self, *args, **kwargs):
 
         if entry.aclgraph is None:
             if self.aclgraph_options.debug_log_enable:
-                # Since we capture cudagraph for many different shapes and
+                # Since we capture aclgraph for many different shapes and
                 # capturing is fast, we don't need to log it for every
                 # shape. E.g. we only log it for the first subgraph in
                 # piecewise mode.
                 logger.debug("Capturing a aclgraph on (%s,%s)",
                              self.runtime_mode.name, entry.batch_descriptor)
-            # validate that cudagraph capturing is legal at this point.
+            # validate that aclgraph capturing is legal at this point.
             validate_cudagraph_capturing_enabled()
 
             input_addresses = [
@@ -137,10 +137,10 @@ def __call__(self, *args, **kwargs):
 
             with ExitStack() as stack:
                 if self.aclgraph_options.gc_disable:
-                    # during every model forward for piecewise cudagraph
-                    # mode, we will capture many pieces of cudagraphs
+                    # during every model forward for piecewise aclgraph
+                    # mode, we will capture many pieces of aclgraphs
                     # (roughly one per layer). running gc again and again
-                    # across layers will make the cudagraph capture very slow.
+                    # across layers will make the aclgraph capture very slow.
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
                     stack.enter_context(patch("gc.collect", lambda: None))
@@ -178,7 +178,7 @@ def __call__(self, *args, **kwargs):
                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
            ]
            assert new_input_addresses == entry.input_addresses, (
-                f"Input addresses for cudagraphs are different "
+                f"Input addresses for aclgraphs are different "
                f"during replay. Expected {entry.input_addresses}, "
                f"got {new_input_addresses}")
 
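
One hedged sketch of the debug check touched in the hunk above: at capture time the wrapper records the data pointers of its tensor arguments, and at replay it asserts the same buffers are being reused. The snippet below only illustrates that idea with plain CPU tensors and an assumed `record_addresses` helper; it is not the wrapper's actual code path:

```python
import torch


def record_addresses(args) -> list[int]:
    # Same shape of check as in acl_graph.py: collect data_ptr() of tensor args.
    return [x.data_ptr() for x in args if isinstance(x, torch.Tensor)]


# "Capture": remember which buffers the graph was recorded against.
static_input = torch.zeros(4)
captured = record_addresses((static_input,))

# "Replay" that reuses the same persistent buffer: addresses still match.
static_input.copy_(torch.arange(4.0))
assert record_addresses((static_input,)) == captured

# A freshly allocated tensor lives at a different address, which is exactly
# the mismatch the replay-time assertion in the diff is meant to catch.
fresh_input = torch.arange(4.0)
assert record_addresses((fresh_input,)) != captured
```
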

vllm_ascend/worker/model_runner_v1.py

Lines changed: 10 additions & 11 deletions
@@ -2124,7 +2124,7 @@ def _dummy_pooler_run_task(
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 raise RuntimeError(
-                    "CUDA out of memory occurred when warming up pooler "
+                    "NPU out of memory occurred when warming up pooler "
                     f"({task=}) with {num_reqs} dummy requests. Please try "
                     "lowering `max_num_seqs` or `gpu_memory_utilization` when "
                     "initializing the engine.") from e
@@ -2440,18 +2440,17 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
 
     def initialize_aclgraph_capture(self) -> None:
         # TODO: Add check of AttentionCGSupport and cudagraph_mode.decode_mode when full graph is supported
-        # Trigger cudagraph dispatching keys initialization here (after
+        # Trigger aclgraph dispatching keys initialization here (after
         # initializing attn backends).
         self.aclgraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
 
     def _capture_aclgraphs(self, compilation_cases: list[int],
-                           cudagraph_runtime_mode: CUDAGraphMode,
+                           aclgraph_runtime_mode: CUDAGraphMode,
                            uniform_decode: bool):
-        assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \
-            cudagraph_runtime_mode in [CUDAGraphMode.FULL,
-                                       CUDAGraphMode.PIECEWISE]
+        assert aclgraph_runtime_mode != CUDAGraphMode.NONE and \
+            aclgraph_runtime_mode in [CUDAGraphMode.PIECEWISE]
 
         # Only rank 0 should print progress bar during capture
         if is_global_first_rank():
@@ -2460,7 +2459,7 @@ def _capture_aclgraphs(self, compilation_cases: list[int],
                 disable=not self.load_config.use_tqdm_on_load,
                 desc="Capturing ACL graphs ({}, {})".format(
                     "decode" if uniform_decode else "mixed prefill-decode",
-                    cudagraph_runtime_mode.name))
+                    aclgraph_runtime_mode.name))
         # We skip EPLB here since we don't want to record dummy metrics
         for num_tokens in compilation_cases:
             for _ in range(self.compilation_config.cudagraph_num_of_warmups):
@@ -2470,13 +2469,13 @@ def _capture_aclgraphs(self, compilation_cases: list[int],
                 # different from the case where `FULL` implies capture
                 # attention while `PIECEWISE` implies no attention.
                 force_attention = (
-                    cudagraph_runtime_mode == CUDAGraphMode.FULL)
+                    aclgraph_runtime_mode == CUDAGraphMode.FULL)
                 self._dummy_run(num_tokens,
                                 aclgraph_runtime_mode=CUDAGraphMode.NONE,
                                 force_attention=force_attention,
                                 uniform_decode=uniform_decode)
             self._dummy_run(num_tokens,
-                            aclgraph_runtime_mode=cudagraph_runtime_mode,
+                            aclgraph_runtime_mode=aclgraph_runtime_mode,
                             uniform_decode=uniform_decode)
 
     def _capture_model(self):
@@ -2492,10 +2491,10 @@ def _capture_model(self):
             compilation_cases = list(reversed(self.aclgraph_batch_sizes))
             self._capture_aclgraphs(
                 compilation_cases,
-                cudagraph_runtime_mode=aclgraph_runtime_mode,
+                aclgraph_runtime_mode=aclgraph_runtime_mode,
                 uniform_decode=False)
 
-        # Disable cudagraph capturing globally, so any unexpected cudagraph
+        # Disable aclgraph capturing globally, so any unexpected aclgraph
         # capturing will be detected and raise an error after here.
         # Note: We don't put it into graph_capture context manager because
         # we may doing lazy capturing in future that still allows capturing
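
For context on the renamed parameter, the capture loop in `_capture_aclgraphs` follows a warmup-then-capture pattern per batch size. A rough, self-contained sketch of that shape (the names `capture_all`, `warmup_runs`, and `run_model` are illustrative only; the real method drives `self._dummy_run` with `aclgraph_runtime_mode`):

```python
from typing import Callable


def capture_all(batch_sizes: list[int], warmup_runs: int,
                run_model: Callable[[int, bool], None]) -> None:
    # Largest sizes first, mirroring list(reversed(self.aclgraph_batch_sizes)).
    for num_tokens in sorted(batch_sizes, reverse=True):
        for _ in range(warmup_runs):
            run_model(num_tokens, False)  # eager warmup pass, no capture
        run_model(num_tokens, True)       # the pass that actually captures


capture_all(
    [1, 8, 32],
    warmup_runs=1,
    run_model=lambda n, capture: print(
        ("capture" if capture else "warmup"), "num_tokens =", n))
```
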
