101 changes: 77 additions & 24 deletions vllm/envs.py
@@ -6,7 +6,7 @@
import os
import sys
import tempfile
from typing import TYPE_CHECKING, Any, Callable, Optional
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

if TYPE_CHECKING:
VLLM_HOST_IP: str = ""
@@ -56,11 +56,12 @@
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
VLLM_USE_RAY_SPMD_WORKER: bool = False
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl",
"shm"] = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
VLLM_XLA_USE_SPMD: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
@@ -77,7 +78,8 @@
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
CMAKE_BUILD_TYPE: Optional[str] = None
CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release",
"RelWithDebInfo"]] = None
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_RPC_TIMEOUT: int = 10000 # ms
@@ -140,22 +142,25 @@
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
VLLM_USE_FLASHINFER_MOE_FP4: bool = False
VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput",
"latency"] = "throughput"
VLLM_XGRAMMAR_CACHE_MB: int = 0
VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
VLLM_ALL2ALL_BACKEND: str = "naive"
VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput",
"deepep_low_latency"] = "naive"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal["FP", "INT8", "INT6", "INT4",
"NONE"] = "NONE"
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
@@ -207,6 +212,48 @@ def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
return bool(int(value))


def env_with_choices(
env_name: str,
default: Optional[str],
choices: Union[list[str], Callable[[], list[str]]],
case_sensitive: bool = True) -> Callable[[], Optional[str]]:
"""
Create a lambda that validates an environment variable against allowed choices.

Args:
env_name: Name of the environment variable
default: Default value if not set (can be None)
choices: List of valid string options or callable that returns list
case_sensitive: Whether validation should be case sensitive

Returns:
Lambda function for environment_variables dict
"""

def _get_validated_env() -> Optional[str]:
value = os.getenv(env_name)
if value is None:
return default

# Resolve choices if it's a callable (for lazy loading)
actual_choices = choices() if callable(choices) else choices

if not case_sensitive:
check_value = value.lower()
check_choices = [choice.lower() for choice in actual_choices]
else:
check_value = value
check_choices = actual_choices

if check_value not in check_choices:
raise ValueError(f"Invalid value '{value}' for {env_name}. "
f"Valid options: {actual_choices}.")

return value

return _get_validated_env
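
A minimal usage sketch of the new helper (illustrative only, not part of this diff; the variable name MY_FAKE_CHANNEL is hypothetical). The returned closure defers both the environment lookup and the validation until it is called, and a case-insensitive match still returns the value exactly as the user set it:

    get_channel = env_with_choices("MY_FAKE_CHANNEL", "auto",
                                   ["auto", "nccl", "shm"],
                                   case_sensitive=False)
    os.environ["MY_FAKE_CHANNEL"] = "NCCL"
    get_channel()   # -> "NCCL": matches "nccl", original casing is preserved
    del os.environ["MY_FAKE_CHANNEL"]
    get_channel()   # -> "auto": falls back to the default when the variable is unset
    os.environ["MY_FAKE_CHANNEL"] = "bogus"
    get_channel()   # raises ValueError listing the valid options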


def get_vllm_port() -> Optional[int]:
"""Get the port from VLLM_PORT environment variable.

@@ -287,7 +334,8 @@ def get_vllm_port() -> Optional[int]:
# If not set, defaults to "Debug" or "RelWithDebInfo"
# Available options: "Debug", "Release", "RelWithDebInfo"
"CMAKE_BUILD_TYPE":
lambda: os.getenv("CMAKE_BUILD_TYPE"),
env_with_choices("CMAKE_BUILD_TYPE", None,
["Debug", "Release", "RelWithDebInfo"]),

# If set, vllm will print verbose logs during installation
"VERBOSE":
@@ -476,7 +524,7 @@ def get_vllm_port() -> Optional[int]:
lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

# Backend for attention computation
# Available options:
# Example options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers
@@ -486,8 +534,11 @@
# - "FLASH_ATTN_MLA": use FlashAttention for MLA
# - "FLASHINFER_MLA": use FlashInfer for MLA
# - "CUTLASS_MLA": use CUTLASS for MLA
# All possible options loaded dynamically from _Backend enum
"VLLM_ATTENTION_BACKEND":
lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
env_with_choices("VLLM_ATTENTION_BACKEND", None,
lambda: list(__import__('vllm.platforms.interface', \
fromlist=['_Backend'])._Backend.__members__.keys())),
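
The callable form of choices is what enables the lazy lookup above: the _Backend enum is imported only when VLLM_ATTENTION_BACKEND is actually read, not when envs.py is imported (presumably to avoid an import cycle or eager platform initialization). A rough restatement of the same pattern with the deferred import written out as a named helper (a sketch, not the code this PR adds):

    def _lazy_backend_choices() -> list[str]:
        # Import resolved only when the validator runs, not at module import time.
        from vllm.platforms.interface import _Backend
        return list(_Backend.__members__.keys())

    get_attention_backend = env_with_choices("VLLM_ATTENTION_BACKEND", None,
                                             _lazy_backend_choices)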

# If set, vllm will use flashinfer sampler
"VLLM_USE_FLASHINFER_SAMPLER":
@@ -550,7 +601,8 @@ def get_vllm_port() -> Optional[int]:
# - "shm": use shared memory and gRPC for communication
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"),
env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto",
["auto", "nccl", "shm"]),

# If the env var is set, it enables GPU communication overlap
# (experimental feature) in Ray's Compiled Graph. This flag is ignored if
@@ -569,7 +621,8 @@
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD":
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "fork",
["spawn", "fork"]),

# Path to the cache for storing downloaded assets
"VLLM_ASSETS_CACHE":
@@ -833,7 +886,8 @@ def get_vllm_port() -> Optional[int]:
# Choice of quantization level: FP, INT8, INT6, INT4 or NONE
# Recommended for large models to get allreduce
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":
lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(),
env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE",
["FP", "INT8", "INT6", "INT4", "NONE"]),

# Custom quick allreduce kernel for MI3* cards
# Due to the lack of the bfloat16 asm instruction, bfloat16
@@ -1075,21 +1129,20 @@ def get_vllm_port() -> Optional[int]:
# - "deepep_high_throughput", use deepep high-throughput kernels
# - "deepep_low_latency", use deepep low-latency kernels
"VLLM_ALL2ALL_BACKEND":
lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
env_with_choices("VLLM_ALL2ALL_BACKEND", "naive",
["naive", "pplx",
"deepep_high_throughput", "deepep_low_latency"]),

# Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
# require compute capability 10.0 or above.
# Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
# Both require compute capability 10.0 or above.
# Available options:
# - "throughput": [default]
# Uses CUTLASS kernels optimized for high-throughput batch inference.
# - "latency":
# Uses TensorRT-LLM kernels optimized for low-latency inference.
# To set this backend, define the environment variable:
# export VLLM_FLASHINFER_MOE_BACKEND=latency.
# If not set, defaults to "throughput".
"VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
"VLLM_FLASHINFER_MOE_BACKEND", "throughput"
),
"VLLM_FLASHINFER_MOE_BACKEND":
env_with_choices("VLLM_FLASHINFER_MOE_BACKEND", "throughput",
["throughput", "latency"]),

# Control the maximum number of tokens per expert supported by the
# NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1145,7 +1198,7 @@ def get_vllm_port() -> Optional[int]:
# leave the layout choice to the backend. Mind that backends may only
# implement and support a subset of all possible layouts.
"VLLM_KV_CACHE_LAYOUT":
lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None),
env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]),

# Enable checking whether the generated logits contain NaNs,
# indicating corrupted output. Useful for debugging low level bugs