diff --git a/vllm/envs.py b/vllm/envs.py
index d2006979ea81..385d2a7c51f2 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -6,7 +6,7 @@
 import os
 import sys
 import tempfile
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
@@ -56,11 +56,12 @@
     VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
+    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl",
+                                                    "shm"] = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
     VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
     VLLM_XLA_USE_SPMD: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
@@ -77,7 +78,8 @@
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
-    CMAKE_BUILD_TYPE: Optional[str] = None
+    CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release",
+                                       "RelWithDebInfo"]] = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
@@ -140,22 +142,25 @@
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput",
+                                         "latency"] = "throughput"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
-    VLLM_ALL2ALL_BACKEND: str = "naive"
+    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput",
+                                  "deepep_low_latency"] = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
-    VLLM_KV_CACHE_LAYOUT: Optional[str] = None
+    VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
     VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
-    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal["FP", "INT8", "INT6", "INT4",
+                                                 "NONE"] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
@@ -207,6 +212,48 @@ def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
     return bool(int(value))
 
 
+def env_with_choices(
+        env_name: str,
+        default: Optional[str],
+        choices: Union[list[str], Callable[[], list[str]]],
+        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
+    """
+    Create a lambda that validates an environment variable against allowed choices
+
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options or callable that returns list
+        case_sensitive: Whether validation should be case sensitive
+
+    Returns:
+        Lambda function for the environment_variables dict
+    """
+
+    def _get_validated_env() -> Optional[str]:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
+            check_value = value
+            check_choices = actual_choices
+
+        if check_value not in check_choices:
+            raise ValueError(f"Invalid value '{value}' for {env_name}. "
+                             f"Valid options: {actual_choices}.")
+
+        return value
+
+    return _get_validated_env
+
+
 def get_vllm_port() -> Optional[int]:
     """Get the port from VLLM_PORT environment variable.
 
@@ -287,7 +334,8 @@
     # If not set, defaults to "Debug" or "RelWithDebInfo"
     # Available options: "Debug", "Release", "RelWithDebInfo"
     "CMAKE_BUILD_TYPE":
-    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    env_with_choices("CMAKE_BUILD_TYPE", None,
+                     ["Debug", "Release", "RelWithDebInfo"]),
 
     # If set, vllm will print verbose logs during installation
     "VERBOSE":
@@ -476,7 +524,7 @@
     lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
 
     # Backend for attention computation
-    # Available options:
+    # Example options:
     # - "TORCH_SDPA": use torch.nn.MultiheadAttention
     # - "FLASH_ATTN": use FlashAttention
     # - "XFORMERS": use XFormers
@@ -486,8 +534,11 @@
     # - "FLASH_ATTN_MLA": use FlashAttention for MLA
     # - "FLASHINFER_MLA": use FlashInfer for MLA
     # - "CUTLASS_MLA": use CUTLASS for MLA
+    # All possible options are loaded dynamically from the _Backend enum
     "VLLM_ATTENTION_BACKEND":
-    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+    env_with_choices("VLLM_ATTENTION_BACKEND", None,
+                     lambda: list(__import__('vllm.platforms.interface', \
+                     fromlist=['_Backend'])._Backend.__members__.keys())),
 
     # If set, vllm will use flashinfer sampler
     "VLLM_USE_FLASHINFER_SAMPLER":
@@ -550,7 +601,8 @@
     # - "shm": use shared memory and gRPC for communication
     # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-    lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"),
+    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto",
+                     ["auto", "nccl", "shm"]),
 
     # If the env var is set, it enables GPU communication overlap
     # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
@@ -569,7 +621,8 @@
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
+    env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "fork",
+                     ["spawn", "fork"]),
 
     # Path to the cache for storing downloaded assets
     "VLLM_ASSETS_CACHE":
@@ -833,7 +886,8 @@
     # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
    # Recommended for large models to get allreduce
     "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":
-    lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(),
+    env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE",
+                     ["FP", "INT8", "INT6", "INT4", "NONE"]),
 
     # Custom quick allreduce kernel for MI3* cards
     # Due to the lack of the bfloat16 asm instruction, bfloat16
@@ -1075,21 +1129,20 @@
     # - "deepep_high_throughput", use deepep high-throughput kernels
     # - "deepep_low_latency", use deepep low-latency kernels
     "VLLM_ALL2ALL_BACKEND":
-    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
+    env_with_choices("VLLM_ALL2ALL_BACKEND", "naive",
+                     ["naive", "pplx",
+                      "deepep_high_throughput", "deepep_low_latency"]),
 
-    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
-    # require compute capability 10.0 or above.
+    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
+    # Both require compute capability 10.0 or above.
     # Available options:
     # - "throughput": [default]
     #   Uses CUTLASS kernels optimized for high-throughput batch inference.
     # - "latency":
     #   Uses TensorRT-LLM kernels optimized for low-latency inference.
-    # To set this backend, define the environment variable:
-    #     export VLLM_FLASHINFER_MOE_BACKEND=latency.
-    # If not set, defaults to "throughput".
-    "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput"
-    ),
+    "VLLM_FLASHINFER_MOE_BACKEND":
+    env_with_choices("VLLM_FLASHINFER_MOE_BACKEND", "throughput",
+                     ["throughput", "latency"]),
 
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1145,7 +1198,7 @@
     # leave the layout choice to the backend. Mind that backends may only
     # implement and support a subset of all possible layouts.
     "VLLM_KV_CACHE_LAYOUT":
-    lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None),
+    env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]),
 
     # Enable checking whether the generated logits contain NaNs,
     # indicating corrupted output. Useful for debugging low level bugs
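For reference, a minimal standalone sketch of how the new validator behaves. The helper is re-declared below so the snippet runs outside vllm, and VLLM_KV_CACHE_LAYOUT is used simply because it is one of the variables this patch migrates:

import os
from typing import Callable, Optional, Union

def env_with_choices(
        env_name: str,
        default: Optional[str],
        choices: Union[list[str], Callable[[], list[str]]],
        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
    # Mirror of the helper added in this diff: validation happens at
    # read time, so a bad value only fails when the variable is consumed.
    def _get_validated_env() -> Optional[str]:
        value = os.getenv(env_name)
        if value is None:
            return default
        actual_choices = choices() if callable(choices) else choices
        if not case_sensitive:
            check_value = value.lower()
            check_choices = [choice.lower() for choice in actual_choices]
        else:
            check_value = value
            check_choices = actual_choices
        if check_value not in check_choices:
            raise ValueError(f"Invalid value '{value}' for {env_name}. "
                             f"Valid options: {actual_choices}.")
        return value
    return _get_validated_env

# A valid value passes through unchanged.
os.environ["VLLM_KV_CACHE_LAYOUT"] = "NHD"
getter = env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"])
assert getter() == "NHD"

# An unset variable falls back to the default (None here).
del os.environ["VLLM_KV_CACHE_LAYOUT"]
assert getter() is None

# An invalid value raises when the variable is read, not at import time.
os.environ["VLLM_KV_CACHE_LAYOUT"] = "bogus"
try:
    getter()
except ValueError as exc:
    print(exc)  # Invalid value 'bogus' for VLLM_KV_CACHE_LAYOUT. ...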
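The choices parameter also accepts a callable, which the patch relies on for VLLM_ATTENTION_BACKEND: the list of valid names is pulled from the _Backend enum only when the variable is first read, so importing envs.py does not create an import cycle with vllm.platforms. Continuing the sketch above, the same pattern with a named function in place of the patch's inline __import__ lambda (_backend_choices is a hypothetical name, and the call assumes a vllm checkout with this patch applied):

def _backend_choices() -> list[str]:
    # Deferred import: runs only when the env var is actually validated.
    from vllm.platforms.interface import _Backend
    return list(_Backend.__members__.keys())

get_backend = env_with_choices("VLLM_ATTENTION_BACKEND", None,
                               _backend_choices)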
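One behavioral nuance of the migration: the old VLLM_ROCM_QUICK_REDUCE_QUANTIZATION getter normalized input with .upper(), while the new entry validates case-sensitively, so a lowercase value such as "int8" that previously worked will now raise. Passing case_sensitive=False would restore tolerant matching, though the helper returns the user's original casing rather than normalizing it. No call site in this diff opts in, so the following is purely illustrative, again reusing the sketch helper:

ql = env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE",
                      ["FP", "INT8", "INT6", "INT4", "NONE"],
                      case_sensitive=False)
os.environ["VLLM_ROCM_QUICK_REDUCE_QUANTIZATION"] = "int8"
assert ql() == "int8"  # accepted, but not normalized to "INT8"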