@@ -155,8 +155,24 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
155155 # Note: workaround for v1 gpu_model_runner
156156 from vllm .config import CompilationLevel
157157 vllm_config .compilation_config .cudagraph_capture_sizes = []
158- vllm_config .compilation_config .level = CompilationLevel .NO_COMPILATION
159- vllm_config .compilation_config .custom_ops = []
158+
159+ compilation_config = vllm_config .compilation_config
160+ if vllm_config .compilation_config .level == CompilationLevel .PIECEWISE :
161+ compilation_config .level = CompilationLevel .DYNAMO_ONCE
162+ compilation_config .backend = "inductor"
163+ compilation_config .custom_ops += ["none" ]
164+ compilation_config .inductor_compile_config .update ({
165+ "dce" :
166+ True ,
167+ "size_asserts" :
168+ False ,
169+ "nan_asserts" :
170+ False ,
171+ "memory_planning" :
172+ True ,
173+ "epilogue_fusion" :
174+ True ,
175+ })
160176
161177 assert vllm_config .device_config .device_type == "cpu"
162178
@@ -192,13 +208,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
192208 # To hint IPEX uses shared memory based AllReduce
193209 os .environ ["LOCAL_WORLD_SIZE" ] = str (
194210 vllm_config .parallel_config .tensor_parallel_size )
195- if sys .platform == "darwin" and \
196- envs .VLLM_WORKER_MULTIPROC_METHOD == "fork" :
197- if os .environ .get ('VLLM_WORKER_MULTIPROC_METHOD' , None ) is None :
198- logger .warning (
199- "Default to spawn method on MacOS. If this is not desired,"
200- " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly." )
201- os .environ ['VLLM_WORKER_MULTIPROC_METHOD' ] = 'spawn'
202211
203212 if vllm_config .model_config and vllm_config .model_config .use_mla :
204213 logger .info (
0 commit comments