66import os
77import sys
88import tempfile
9- from typing import TYPE_CHECKING , Any , Callable , Optional
9+ from typing import TYPE_CHECKING , Any , Callable , Literal , Optional , Union
1010
1111if TYPE_CHECKING :
1212 VLLM_HOST_IP : str = ""
5656 VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING : bool = True
5757 VLLM_USE_RAY_SPMD_WORKER : bool = False
5858 VLLM_USE_RAY_COMPILED_DAG : bool = False
59- VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE : str = "auto"
59+ VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE : Literal ["auto" , "nccl" ,
60+ "shm" ] = "auto"
6061 VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM : bool = False
6162 VLLM_USE_RAY_WRAPPED_PP_COMM : bool = True
6263 VLLM_XLA_USE_SPMD : bool = False
63- VLLM_WORKER_MULTIPROC_METHOD : str = "fork"
64+ VLLM_WORKER_MULTIPROC_METHOD : Literal [ "fork" , "spawn" ] = "fork"
6465 VLLM_ASSETS_CACHE : str = os .path .join (VLLM_CACHE_ROOT , "assets" )
6566 VLLM_IMAGE_FETCH_TIMEOUT : int = 5
6667 VLLM_VIDEO_FETCH_TIMEOUT : int = 30
7778 VLLM_DOCKER_BUILD_CONTEXT : bool = False
7879 VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL : bool = False
7980 VLLM_KEEP_ALIVE_ON_ENGINE_DEATH : bool = False
80- CMAKE_BUILD_TYPE : Optional [str ] = None
81+ CMAKE_BUILD_TYPE : Optional [Literal ["Debug" , "Release" ,
82+ "RelWithDebInfo" ]] = None
8183 VERBOSE : bool = False
8284 VLLM_ALLOW_LONG_MAX_MODEL_LEN : bool = False
8385 VLLM_RPC_TIMEOUT : int = 10000 # ms
140142 VLLM_USE_FUSED_MOE_GROUPED_TOPK : bool = True
141143 VLLM_USE_FLASHINFER_MOE_FP8 : bool = False
142144 VLLM_USE_FLASHINFER_MOE_FP4 : bool = False
143- VLLM_FLASHINFER_MOE_BACKEND : str = "throughput"
145+ VLLM_FLASHINFER_MOE_BACKEND : Literal ["throughput" ,
146+ "latency" ] = "throughput"
144147 VLLM_XGRAMMAR_CACHE_MB : int = 0
145148 VLLM_MSGPACK_ZERO_COPY_THRESHOLD : int = 256
146149 VLLM_ALLOW_INSECURE_SERIALIZATION : bool = False
147150 VLLM_NIXL_SIDE_CHANNEL_HOST : str = "localhost"
148151 VLLM_NIXL_SIDE_CHANNEL_PORT : int = 5557
149- VLLM_ALL2ALL_BACKEND : str = "naive"
152+ VLLM_ALL2ALL_BACKEND : Literal ["naive" , "pplx" , "deepep_high_throughput" ,
153+ "deepep_low_latency" ] = "naive"
150154 VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE : int = 163840
151155 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS : int = 1
152156 VLLM_SLEEP_WHEN_IDLE : bool = False
153157 VLLM_MQ_MAX_CHUNK_BYTES_MB : int = 16
154158 VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS : int = 300
155- VLLM_KV_CACHE_LAYOUT : Optional [str ] = None
159+ VLLM_KV_CACHE_LAYOUT : Optional [Literal [ "NHD" , "HND" ] ] = None
156160 VLLM_COMPUTE_NANS_IN_LOGITS : bool = False
157161 VLLM_USE_NVFP4_CT_EMULATIONS : bool = False
158- VLLM_ROCM_QUICK_REDUCE_QUANTIZATION : str = "NONE"
162+ VLLM_ROCM_QUICK_REDUCE_QUANTIZATION : Literal ["FP" , "INT8" , "INT6" , "INT4" ,
163+ "NONE" ] = "NONE"
159164 VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16 : bool = True
160165 VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB : Optional [int ] = None
161166 VLLM_NIXL_ABORT_REQUEST_TIMEOUT : int = 120
@@ -207,6 +212,48 @@ def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
207212 return bool (int (value ))
208213
209214
def env_with_choices(
        env_name: str,
        default: Optional[str],
        choices: Union[list[str], Callable[[], list[str]]],
        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
    """Build a zero-argument getter that reads and validates an env var.

    The returned callable is intended to be stored in the
    ``environment_variables`` dict: it reads ``env_name`` on each call,
    returns ``default`` when unset, and raises if the value is not one of
    the allowed choices.

    Args:
        env_name: Name of the environment variable.
        default: Value returned when the variable is unset (may be None).
        choices: Allowed string values, or a zero-argument callable that
            produces them (the callable form defers imports until the
            variable is actually read).
        case_sensitive: When False, the comparison ignores letter case;
            the value is still returned with its original casing.

    Returns:
        A function suitable for the environment_variables dict.
    """

    def _get_validated_env() -> Optional[str]:
        raw = os.getenv(env_name)
        if raw is None:
            return default

        # Late-bind the choice list when a callable was supplied
        # (lazy loading avoids import cycles at module import time).
        actual_choices = choices() if callable(choices) else choices

        if case_sensitive:
            candidate = raw
            pool = actual_choices
        else:
            candidate = raw.lower()
            pool = [option.lower() for option in actual_choices]

        if candidate not in pool:
            raise ValueError(f"Invalid value '{raw}' for {env_name}. "
                             f"Valid options: {actual_choices}.")

        return raw

    return _get_validated_env
255+
256+
210257def get_vllm_port () -> Optional [int ]:
211258 """Get the port from VLLM_PORT environment variable.
212259
@@ -287,7 +334,8 @@ def get_vllm_port() -> Optional[int]:
287334 # If not set, defaults to "Debug" or "RelWithDebInfo"
288335 # Available options: "Debug", "Release", "RelWithDebInfo"
289336 "CMAKE_BUILD_TYPE" :
290- lambda : os .getenv ("CMAKE_BUILD_TYPE" ),
337+ env_with_choices ("CMAKE_BUILD_TYPE" , None ,
338+ ["Debug" , "Release" , "RelWithDebInfo" ]),
291339
292340 # If set, vllm will print verbose logs during installation
293341 "VERBOSE" :
@@ -476,7 +524,7 @@ def get_vllm_port() -> Optional[int]:
476524 lambda : int (os .getenv ("VLLM_TRACE_FUNCTION" , "0" )),
477525
478526 # Backend for attention computation
479- # Available options:
527+ # Example options:
480528 # - "TORCH_SDPA": use torch.nn.MultiheadAttention
481529 # - "FLASH_ATTN": use FlashAttention
482530 # - "XFORMERS": use XFormers
@@ -486,8 +534,11 @@ def get_vllm_port() -> Optional[int]:
486534 # - "FLASH_ATTN_MLA": use FlashAttention for MLA
487535 # - "FLASHINFER_MLA": use FlashInfer for MLA
488536 # - "CUTLASS_MLA": use CUTLASS for MLA
537+ # All possible options loaded dynamically from _Backend enum
489538 "VLLM_ATTENTION_BACKEND" :
490- lambda : os .getenv ("VLLM_ATTENTION_BACKEND" , None ),
539+ env_with_choices ("VLLM_ATTENTION_BACKEND" , None ,
540+ lambda : list (__import__ ('vllm.platforms.interface' , \
541+ fromlist = ['_Backend' ])._Backend .__members__ .keys ())),
491542
492543 # If set, vllm will use flashinfer sampler
493544 "VLLM_USE_FLASHINFER_SAMPLER" :
@@ -550,7 +601,8 @@ def get_vllm_port() -> Optional[int]:
550601 # - "shm": use shared memory and gRPC for communication
551602 # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
552603 "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE" :
553- lambda : os .getenv ("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE" , "auto" ),
604+ env_with_choices ("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE" , "auto" ,
605+ ["auto" , "nccl" , "shm" ]),
554606
555607 # If the env var is set, it enables GPU communication overlap
556608 # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
@@ -569,7 +621,8 @@ def get_vllm_port() -> Optional[int]:
569621 # Use dedicated multiprocess context for workers.
570622 # Both spawn and fork work
571623 "VLLM_WORKER_MULTIPROC_METHOD" :
572- lambda : os .getenv ("VLLM_WORKER_MULTIPROC_METHOD" , "fork" ),
624+ env_with_choices ("VLLM_WORKER_MULTIPROC_METHOD" , "fork" ,
625+ ["spawn" , "fork" ]),
573626
574627 # Path to the cache for storing downloaded assets
575628 "VLLM_ASSETS_CACHE" :
@@ -833,7 +886,8 @@ def get_vllm_port() -> Optional[int]:
833886 # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
834887 # Recommended for large models to get allreduce
835888 "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION" :
836- lambda : os .getenv ("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION" , "NONE" ).upper (),
889+ env_with_choices ("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION" , "NONE" ,
890+ ["FP" , "INT8" , "INT6" , "INT4" , "NONE" ]),
837891
838892 # Custom quick allreduce kernel for MI3* cards
839893 # Due to the lack of the bfloat16 asm instruction, bfloat16
@@ -1075,21 +1129,20 @@ def get_vllm_port() -> Optional[int]:
10751129 # - "deepep_high_throughput", use deepep high-throughput kernels
10761130 # - "deepep_low_latency", use deepep low-latency kernels
10771131 "VLLM_ALL2ALL_BACKEND" :
1078- lambda : os .getenv ("VLLM_ALL2ALL_BACKEND" , "naive" ),
1132+ env_with_choices ("VLLM_ALL2ALL_BACKEND" , "naive" ,
1133+ ["naive" , "pplx" ,
1134+ "deepep_high_throughput" , "deepep_low_latency" ]),
10791135
1080- # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
1081- # require compute capability 10.0 or above.
1136+ # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
1137+ # Both require compute capability 10.0 or above.
10821138 # Available options:
10831139 # - "throughput": [default]
10841140 # Uses CUTLASS kernels optimized for high-throughput batch inference.
10851141 # - "latency":
10861142 # Uses TensorRT-LLM kernels optimized for low-latency inference.
1087- # To set this backend, define the environment variable:
1088- # export VLLM_FLASHINFER_MOE_BACKEND=latency.
1089- # If not set, defaults to "throughput".
1090- "VLLM_FLASHINFER_MOE_BACKEND" : lambda : os .getenv (
1091- "VLLM_FLASHINFER_MOE_BACKEND" , "throughput"
1092- ),
1143+ "VLLM_FLASHINFER_MOE_BACKEND" :
1144+ env_with_choices ("VLLM_FLASHINFER_MOE_BACKEND" , "throughput" ,
1145+ ["throughput" , "latency" ]),
10931146
10941147 # Control the maximum number of tokens per expert supported by the
10951148 # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1145,7 +1198,7 @@ def get_vllm_port() -> Optional[int]:
11451198 # leave the layout choice to the backend. Mind that backends may only
11461199 # implement and support a subset of all possible layouts.
11471200 "VLLM_KV_CACHE_LAYOUT" :
1148- lambda : os . getenv ("VLLM_KV_CACHE_LAYOUT" , None ),
1201+ env_with_choices ("VLLM_KV_CACHE_LAYOUT" , None , [ "NHD" , "HND" ] ),
11491202
11501203 # Enable checking whether the generated logits contain NaNs,
11511204 # indicating corrupted output. Useful for debugging low level bugs
0 commit comments