@@ -26,7 +26,7 @@ class ACLGraphEntry:
     aclgraph: Optional[torch.npu.NPUGraph] = None
     output: Optional[Any] = None
 
-    # for cudagraph debugging, track the input addresses
+    # for aclgraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
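For reference, the dataclass this hunk touches is roughly the following sketch, reconstructed from the fields shown above. The loose typing of batch_descriptor is an assumption here (vLLM keys entries by its BatchDescriptor), and torch.npu.NPUGraph requires the torch_npu plugin to be installed:

from dataclasses import dataclass
from typing import Any, Optional

import torch  # torch.npu.NPUGraph is only available with the torch_npu plugin

@dataclass
class ACLGraphEntry:
    batch_descriptor: Any  # loosely typed in this sketch; vLLM uses BatchDescriptor
    aclgraph: Optional[torch.npu.NPUGraph] = None
    output: Optional[Any] = None

    # for aclgraph debugging, track the input addresses
    # during capture, and check if they are the same during replay
    input_addresses: Optional[list[int]] = None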
@@ -35,16 +35,16 @@ class ACLGraphWrapper:
     """Wraps a runnable to add acl graph capturing and replaying ability. And
     provide attribute access to the underlying `runnable` via `__getattr__`.
 
-    The workflow of this wrapper in the cudagraph dispatching is as follows:
+    The workflow of this wrapper in the aclgraph dispatching is as follows:
     1. At initialization, a runtime mode is assigned to the wrapper (FULL or
     PIECEWISE).
     2. At runtime, the wrapper receives a runtime_mode and a
     batch_descriptor(key) from the forward context and blindly trust them
-    for cudagraph dispatching.
+    for aclgraph dispatching.
     3. If runtime_mode is NONE or runtime_mode does not match the mode of the
     wrapper, just call the runnable directly.
     4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
-    the wrapper will perform cudagraph capture(if key does not exist, create
+    the wrapper will perform aclgraph capture(if key does not exist, create
     a new entry and cache it) or replay (if key exists in the cache).
 
     Note: ACLGraphWrapper does not store persistent buffers or copy any
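The four docstring steps condense to a dispatch routine like the sketch below. The capture() helper is hypothetical, the entry cache is treated as a plain dict keyed by the batch descriptor, and the real wrapper reads runtime_mode and batch_descriptor from vLLM's forward context rather than taking them as arguments (the CUDAGraphMode import path may vary by vLLM version):

from vllm.config import CUDAGraphMode  # import path assumed; varies by version

def dispatch_sketch(wrapper, runtime_mode, batch_descriptor, *args, **kwargs):
    # steps 2-3: trust the caller-provided mode; bypass on NONE or a mismatch
    if runtime_mode == CUDAGraphMode.NONE or \
            runtime_mode != wrapper.runtime_mode:
        return wrapper.runnable(*args, **kwargs)
    # step 4: matching mode -> capture on a cache miss, replay on a hit
    entry = wrapper.concrete_aclgraph_entries.get(batch_descriptor)
    if entry is None or entry.aclgraph is None:
        return capture(wrapper, batch_descriptor, *args, **kwargs)  # hypothetical helper
    entry.aclgraph.replay()
    return entry.output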
@@ -71,7 +71,7 @@ def __init__(self,
         self.first_run_finished = False
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
 
-        # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't
+        # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't
         # need to initialize a ACLGraphWrapper.
         assert self.runtime_mode != CUDAGraphMode.NONE
         if self.graph_pool is None:
@@ -81,7 +81,7 @@ def __init__(self,
             cudagraph_options = CUDAGraphOptions()
         self.aclgraph_options = cudagraph_options
         # the entries for different batch descriptors that we need to capture
-        # cudagraphs for.
+        # aclgraphs for.
         self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry]\
             = {}
 
@@ -90,7 +90,7 @@ def __getattr__(self, key: str):
         if hasattr(self.runnable, key):
             return getattr(self.runnable, key)
         raise AttributeError(f"Attribute {key} not exists in the runnable of "
-                             f"cudagraph wrapper: {self.runnable}")
+                             f"aclgraph wrapper: {self.runnable}")
 
     def unwrap(self) -> Callable:
         # in case we need to access the original runnable.
@@ -104,7 +104,7 @@ def __call__(self, *args, **kwargs):
         if aclgraph_runtime_mode == CUDAGraphMode.NONE or \
                 aclgraph_runtime_mode != self.runtime_mode:
             # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
-            # running without cudagraphs.
+            # running without aclgraphs.
             # We do not trigger capture/replay if the runtime mode is not
             # matches. This enables properly dispatching to the correct
             # CUDAGraphWrapper when nesting multiple instances with different
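That pass-through on a mode mismatch is what makes nesting work. A sketch of the idea; the constructor signature below is assumed for illustration (the real __init__ takes additional arguments such as the vllm_config and graph pool), and model_fn is a placeholder:

# Hypothetical construction; only the wrapper whose mode matches the
# runtime mode captures/replays, the other calls straight through.
piecewise = ACLGraphWrapper(model_fn, runtime_mode=CUDAGraphMode.PIECEWISE)
full = ACLGraphWrapper(piecewise, runtime_mode=CUDAGraphMode.FULL)
# forward context says FULL      -> `full` acts, `piecewise` passes through
# forward context says PIECEWISE -> `full` passes through, `piecewise` acts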
@@ -120,13 +120,13 @@ def __call__(self, *args, **kwargs):
 
         if entry.aclgraph is None:
             if self.aclgraph_options.debug_log_enable:
-                # Since we capture cudagraph for many different shapes and
+                # Since we capture aclgraph for many different shapes and
                 # capturing is fast, we don't need to log it for every
                 # shape. E.g. we only log it for the first subgraph in
                 # piecewise mode.
                 logger.debug("Capturing a aclgraph on (%s,%s)",
                              self.runtime_mode.name, entry.batch_descriptor)
-            # validate that cudagraph capturing is legal at this point.
+            # validate that aclgraph capturing is legal at this point.
             validate_cudagraph_capturing_enabled()
 
             input_addresses = [
@@ -137,10 +137,10 @@ def __call__(self, *args, **kwargs):
 
             with ExitStack() as stack:
                 if self.aclgraph_options.gc_disable:
-                    # during every model forward for piecewise cudagraph
-                    # mode, we will capture many pieces of cudagraphs
+                    # during every model forward for piecewise aclgraph
+                    # mode, we will capture many pieces of aclgraphs
                     # (roughly one per layer). running gc again and again
-                    # across layers will make the cudagraph capture very slow.
+                    # across layers will make the aclgraph capture very slow.
                     # therefore, we only run gc for the first graph,
                     # and disable gc for the rest of the graphs.
                     stack.enter_context(patch("gc.collect", lambda: None))
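The gc trick above is self-contained enough to demo standalone; this sketch only illustrates the patching mechanism, not the wrapper's capture path:

import gc
from contextlib import ExitStack
from unittest.mock import patch

with ExitStack() as stack:
    # inside the stack, gc.collect is a no-op, so repeated per-layer
    # graph captures stop paying for full collections
    stack.enter_context(patch("gc.collect", lambda: None))
    gc.collect()  # no-op while the patch is active
gc.collect()  # the ExitStack has unwound; this is a real collection again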
@@ -178,7 +178,7 @@ def __call__(self, *args, **kwargs):
                 x.data_ptr() for x in args if isinstance(x, torch.Tensor)
             ]
             assert new_input_addresses == entry.input_addresses, (
-                f"Input addresses for cudagraphs are different "
+                f"Input addresses for aclgraphs are different "
                 f"during replay. Expected {entry.input_addresses}, "
                 f"got {new_input_addresses}")
 
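Replay reuses the exact device buffers recorded at capture time, so an input tensor that has moved would corrupt results silently; this assert surfaces that in debug mode. A standalone sketch of the same check, with the function name chosen here for illustration:

import torch

def check_replay_addresses(entry_input_addresses: list[int], *args) -> None:
    # a captured graph reads and writes fixed device addresses, so replay
    # inputs must live at the addresses recorded during capture
    new_input_addresses = [
        x.data_ptr() for x in args if isinstance(x, torch.Tensor)
    ]
    assert new_input_addresses == entry_input_addresses, (
        f"Input addresses for aclgraphs are different during replay. "
        f"Expected {entry_input_addresses}, got {new_input_addresses}")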