
Commit f38a389

some renames
Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent 775d67f commit f38a389

2 files changed: 24 additions, 25 deletions


vllm_ascend/compilation/acl_graph.py

Lines changed: 14 additions & 14 deletions
@@ -26,7 +26,7 @@ class ACLGraphEntry:
     aclgraph: Optional[torch.npu.NPUGraph] = None
     output: Optional[Any] = None
 
-    # for cudagraph debugging, track the input addresses
+    # for aclgraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
@@ -35,16 +35,16 @@ class ACLGraphWrapper:
     """Wraps a runnable to add acl graph capturing and replaying ability. And
     provide attribute access to the underlying `runnable` via `__getattr__`.
 
-    The workflow of this wrapper in the cudagraph dispatching is as follows:
+    The workflow of this wrapper in the aclgraph dispatching is as follows:
     1. At initialization, a runtime mode is assigned to the wrapper (FULL or
     PIECEWISE).
     2. At runtime, the wrapper receives a runtime_mode and a
     batch_descriptor(key) from the forward context and blindly trust them
-    for cudagraph dispatching.
+    for aclgraph dispatching.
     3. If runtime_mode is NONE or runtime_mode does not match the mode of the
     wrapper, just call the runnable directly.
     4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
-    the wrapper will perform cudagraph capture(if key does not exist, create
+    the wrapper will perform aclgraph capture(if key does not exist, create
     a new entry and cache it) or replay (if key exists in the cache).
 
     Note: ACLGraphWrapper does not store persistent buffers or copy any
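
The dispatch rules spelled out in this docstring are the core of the wrapper. As a quick illustration only (a minimal, self-contained sketch; `Mode`, `dispatch`, and the plain dict cache are stand-ins, not the vllm-ascend API), the decision flow looks roughly like this:

```python
from enum import Enum
from typing import Any, Callable


class Mode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2


def dispatch(wrapper_mode: Mode, runtime_mode: Mode, key: Any,
             cache: dict, runnable: Callable, *args) -> Any:
    # Rule 3: NONE or a mismatched mode bypasses graph logic entirely.
    if runtime_mode is Mode.NONE or runtime_mode is not wrapper_mode:
        return runnable(*args)
    # Rule 4: matching mode -> "capture" on first sight of this key
    # (here simply running once and caching the output), "replay" afterwards.
    if key not in cache:
        cache[key] = runnable(*args)
    return cache[key]


cache: dict = {}
print(dispatch(Mode.PIECEWISE, Mode.NONE, 8, cache, lambda x: 2 * x, 3))       # bypassed
print(dispatch(Mode.PIECEWISE, Mode.PIECEWISE, 8, cache, lambda x: 2 * x, 3))  # "capture"
print(dispatch(Mode.PIECEWISE, Mode.PIECEWISE, 8, cache, lambda x: 2 * x, 3))  # "replay"
```
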
@@ -71,7 +71,7 @@ def __init__(self,
         self.first_run_finished = False
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
 
-        # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't
+        # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't
         # need to initialize a ACLGraphWrapper.
         assert self.runtime_mode != CUDAGraphMode.NONE
         if self.graph_pool is None:
@@ -81,7 +81,7 @@ def __init__(self,
             cudagraph_options = CUDAGraphOptions()
         self.aclgraph_options = cudagraph_options
         # the entries for different batch descriptors that we need to capture
-        # cudagraphs for.
+        # aclgraphs for.
         self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry]\
             = {}
 
@@ -90,7 +90,7 @@ def __getattr__(self, key: str):
         if hasattr(self.runnable, key):
            return getattr(self.runnable, key)
         raise AttributeError(f"Attribute {key} not exists in the runnable of "
-                             f"cudagraph wrapper: {self.runnable}")
+                             f"aclgraph wrapper: {self.runnable}")
 
     def unwrap(self) -> Callable:
         # in case we need to access the original runnable.
@@ -104,7 +104,7 @@ def __call__(self, *args, **kwargs):
         if aclgraph_runtime_mode == CUDAGraphMode.NONE or \
             aclgraph_runtime_mode != self.runtime_mode:
             # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
-            # running without cudagraphs.
+            # running without aclgraphs.
             # We do not trigger capture/replay if the runtime mode is not
             # matches. This enables properly dispatching to the correct
             # CUDAGraphWrapper when nesting multiple instances with different
@@ -120,13 +120,13 @@ def __call__(self, *args, **kwargs):
 
         if entry.aclgraph is None:
             if self.aclgraph_options.debug_log_enable:
-                # Since we capture cudagraph for many different shapes and
+                # Since we capture aclgraph for many different shapes and
                 # capturing is fast, we don't need to log it for every
                 # shape. E.g. we only log it for the first subgraph in
                 # piecewise mode.
                 logger.debug("Capturing a aclgraph on (%s,%s)",
                              self.runtime_mode.name, entry.batch_descriptor)
-            # validate that cudagraph capturing is legal at this point.
+            # validate that aclgraph capturing is legal at this point.
             validate_cudagraph_capturing_enabled()
 
             input_addresses = [
@@ -137,10 +137,10 @@ def __call__(self, *args, **kwargs):
 
             with ExitStack() as stack:
                 if self.aclgraph_options.gc_disable:
-                    # during every model forward for piecewise cudagraph
-                    # mode, we will capture many pieces of cudagraphs
+                    # during every model forward for piecewise aclgraph
+                    # mode, we will capture many pieces of aclgraphs
                     # (roughly one per layer). running gc again and again
-                    # across layers will make the cudagraph capture very slow.
+                    # across layers will make the aclgraph capture very slow.
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
                     stack.enter_context(patch("gc.collect", lambda: None))
@@ -178,7 +178,7 @@ def __call__(self, *args, **kwargs):
                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
            ]
            assert new_input_addresses == entry.input_addresses, (
-                f"Input addresses for cudagraphs are different "
+                f"Input addresses for aclgraphs are different "
                f"during replay. Expected {entry.input_addresses}, "
                f"got {new_input_addresses}")
 
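
One hedged sketch of the debug check touched in the hunk above: at capture time the wrapper records the data pointers of its tensor arguments, and at replay it asserts the same buffers are being reused. The snippet below only illustrates that idea with plain CPU tensors and an assumed `record_addresses` helper; it is not the wrapper's actual code path:

```python
import torch


def record_addresses(args) -> list[int]:
    # Same shape of check as in acl_graph.py: collect data_ptr() of tensor args.
    return [x.data_ptr() for x in args if isinstance(x, torch.Tensor)]


# "Capture": remember which buffers the graph was recorded against.
static_input = torch.zeros(4)
captured = record_addresses((static_input,))

# "Replay" that reuses the same persistent buffer: addresses still match.
static_input.copy_(torch.arange(4.0))
assert record_addresses((static_input,)) == captured

# A freshly allocated tensor lives at a different address, which is exactly
# the mismatch the replay-time assertion in the diff is meant to catch.
fresh_input = torch.arange(4.0)
assert record_addresses((fresh_input,)) != captured
```
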

vllm_ascend/worker/model_runner_v1.py

Lines changed: 10 additions & 11 deletions
@@ -2124,7 +2124,7 @@ def _dummy_pooler_run_task(
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 raise RuntimeError(
-                    "CUDA out of memory occurred when warming up pooler "
+                    "NPU out of memory occurred when warming up pooler "
                     f"({task=}) with {num_reqs} dummy requests. Please try "
                     "lowering `max_num_seqs` or `gpu_memory_utilization` when "
                     "initializing the engine.") from e
@@ -2440,18 +2440,17 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
 
     def initialize_aclgraph_capture(self) -> None:
         # TODO: Add check of AttentionCGSupport and cudagraph_mode.decode_mode when full graph is supported
-        # Trigger cudagraph dispatching keys initialization here (after
+        # Trigger aclgraph dispatching keys initialization here (after
         # initializing attn backends).
         self.aclgraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
 
     def _capture_aclgraphs(self, compilation_cases: list[int],
-                           cudagraph_runtime_mode: CUDAGraphMode,
+                           aclgraph_runtime_mode: CUDAGraphMode,
                            uniform_decode: bool):
-        assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \
-            cudagraph_runtime_mode in [CUDAGraphMode.FULL,
-                                       CUDAGraphMode.PIECEWISE]
+        assert aclgraph_runtime_mode != CUDAGraphMode.NONE and \
+            aclgraph_runtime_mode in [CUDAGraphMode.PIECEWISE]
 
         # Only rank 0 should print progress bar during capture
         if is_global_first_rank():
@@ -2460,7 +2459,7 @@ def _capture_aclgraphs(self, compilation_cases: list[int],
                 disable=not self.load_config.use_tqdm_on_load,
                 desc="Capturing ACL graphs ({}, {})".format(
                     "decode" if uniform_decode else "mixed prefill-decode",
-                    cudagraph_runtime_mode.name))
+                    aclgraph_runtime_mode.name))
         # We skip EPLB here since we don't want to record dummy metrics
         for num_tokens in compilation_cases:
             for _ in range(self.compilation_config.cudagraph_num_of_warmups):
@@ -2470,13 +2469,13 @@ def _capture_aclgraphs(self, compilation_cases: list[int],
                 # different from the case where `FULL` implies capture
                 # attention while `PIECEWISE` implies no attention.
                 force_attention = (
-                    cudagraph_runtime_mode == CUDAGraphMode.FULL)
+                    aclgraph_runtime_mode == CUDAGraphMode.FULL)
                 self._dummy_run(num_tokens,
                                 aclgraph_runtime_mode=CUDAGraphMode.NONE,
                                 force_attention=force_attention,
                                 uniform_decode=uniform_decode)
             self._dummy_run(num_tokens,
-                            aclgraph_runtime_mode=cudagraph_runtime_mode,
+                            aclgraph_runtime_mode=aclgraph_runtime_mode,
                             uniform_decode=uniform_decode)
 
     def _capture_model(self):
@@ -2492,10 +2491,10 @@ def _capture_model(self):
             compilation_cases = list(reversed(self.aclgraph_batch_sizes))
             self._capture_aclgraphs(
                 compilation_cases,
-                cudagraph_runtime_mode=aclgraph_runtime_mode,
+                aclgraph_runtime_mode=aclgraph_runtime_mode,
                 uniform_decode=False)
 
-        # Disable cudagraph capturing globally, so any unexpected cudagraph
+        # Disable aclgraph capturing globally, so any unexpected aclgraph
         # capturing will be detected and raise an error after here.
         # Note: We don't put it into graph_capture context manager because
         # we may doing lazy capturing in future that still allows capturing
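
For context on the renamed parameter, the capture loop in `_capture_aclgraphs` follows a warmup-then-capture pattern per batch size. A rough, self-contained sketch of that shape (the names `capture_all`, `warmup_runs`, and `run_model` are illustrative only; the real method drives `self._dummy_run` with `aclgraph_runtime_mode`):

```python
from typing import Callable


def capture_all(batch_sizes: list[int], warmup_runs: int,
                run_model: Callable[[int, bool], None]) -> None:
    # Largest sizes first, mirroring list(reversed(self.aclgraph_batch_sizes)).
    for num_tokens in sorted(batch_sizes, reverse=True):
        for _ in range(warmup_runs):
            run_model(num_tokens, False)  # eager warmup pass, no capture
        run_model(num_tokens, True)       # the pass that actually captures


capture_all(
    [1, 8, 32],
    warmup_runs=1,
    run_model=lambda n, capture: print(
        ("capture" if capture else "warmup"), "num_tokens =", n))
```
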
