1 file changed: +2 -2 lines changed
Original file line number | Diff line number | Diff line change
119119 VLLM_SERVER_DEV_MODE : bool = False
120120 VLLM_V1_OUTPUT_PROC_CHUNK_SIZE : int = 128
121121 VLLM_MLA_DISABLE : bool = False
122- VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH : int = 16
122+ VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH : int = 32
123123 VLLM_RAY_PER_WORKER_GPUS : float = 1.0
124124 VLLM_RAY_BUNDLE_INDICES : str = ""
125125 VLLM_CUDART_SO_PATH : Optional [str ] = None
@@ -1017,7 +1017,7 @@ def get_vllm_port() -> Optional[int]:
10171017 # max number splits for cuda graph decode
10181018 "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH" :
10191019 lambda : int (os .getenv ("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH" ,
1020- "16 " )),
1020+ "32 " )),
10211021
10221022 # Number of GPUs per worker in Ray, if it is set to be a fraction,
10231023 # it allows ray to schedule multiple actors on a single GPU,
You can’t perform that action at this time.
0 commit comments