|
119 | 119 | VLLM_SERVER_DEV_MODE: bool = False |
120 | 120 | VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 |
121 | 121 | VLLM_MLA_DISABLE: bool = False |
| 122 | + VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16 |
122 | 123 | VLLM_RAY_PER_WORKER_GPUS: float = 1.0 |
123 | 124 | VLLM_RAY_BUNDLE_INDICES: str = "" |
124 | 125 | VLLM_CUDART_SO_PATH: Optional[str] = None |
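The new knob is declared with a default of 16. As a hedged usage example (not part of the diff), a user could override it from Python before vLLM reads its environment, or equivalently export it in the shell:

```python
import os

# Example only: override the new cap before vLLM is imported/started.
# The value must parse as an int; when unset, it falls back to 16.
os.environ["VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"] = "8"
```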
@@ -946,6 +947,12 @@ def get_vllm_port() -> Optional[int]: |
946 | 947 | "VLLM_MLA_DISABLE": |
947 | 948 | lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), |
948 | 949 |
|
| 950 | + # If set, vLLM will use the provided maximum number of |
| 951 | + # Flash Attention MLA splits for CUDA graph decode |
| 952 | + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": |
| 953 | + lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", |
| 954 | + "16")), |
| 955 | + |
949 | 956 | # Number of GPUs per worker in Ray, if it is set to be a fraction, |
950 | 957 | # it allows ray to schedule multiple actors on a single GPU, |
951 | 958 | # so that users can colocate other actors on the same GPUs as vLLM. |
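For reference, a minimal standalone sketch of the lazy-getter pattern this hunk extends: the dictionary and surrounding plumbing are simplified stand-ins for vLLM's envs module, but the lambda mirrors the added lines above (parsed as `int`, default `"16"`):

```python
import os

# Simplified sketch of the env-var getter pattern: each variable maps to a
# lambda that parses the raw string on access and falls back to its default.
environment_variables = {
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
        lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
                              "16")),
    "VLLM_MLA_DISABLE":
        lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
}

# Usage: call the getter to obtain the typed value (16 when unset).
max_num_splits = environment_variables[
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"]()
assert isinstance(max_num_splits, int)
```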
@@ -1379,6 +1386,7 @@ def compute_hash() -> str: |
1379 | 1386 | environment_variables_to_hash = [ |
1380 | 1387 | "VLLM_PP_LAYER_PARTITION", |
1381 | 1388 | "VLLM_MLA_DISABLE", |
| 1389 | + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", |
1382 | 1390 | "VLLM_USE_TRITON_FLASH_ATTN", |
1383 | 1391 | "VLLM_USE_TRITON_AWQ", |
1384 | 1392 | "VLLM_DP_RANK", |
|