
Commit 41973c3

Initialize the vLLM config in init_app_state; if envs.VLLM_SERVER_DEV_MODE is 1, add a "/server_info" endpoint in api_server.
Signed-off-by: Xihui Cang <xihuicang@gmail.com>
1 parent 1a72372 commit 41973c3

5 files changed (+24, -26 lines)
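For context, a minimal client sketch for the new endpoint. The host, the port, and the server having been started with VLLM_SERVER_DEV_MODE=1 are assumptions for illustration, not part of this commit.

    import json
    import urllib.request

    # Query the dev-mode endpoint; per this commit, the handler returns
    # {"vllm_config": str(app.state.vllm_config)} as JSON.
    with urllib.request.urlopen("http://localhost:8000/server_info") as resp:
        server_info = json.load(resp)

    print(server_info["vllm_config"])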

vllm/engine/async_llm_engine.py

Lines changed: 4 additions & 0 deletions
@@ -1171,6 +1171,10 @@ async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         return self.engine.get_model_config()
 
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        return self.engine.get_vllm_config()
+
     async def get_parallel_config(self) -> ParallelConfig:
         """Get the parallel configuration of the vLLM engine."""
         return self.engine.get_parallel_config()

vllm/engine/llm_engine.py

Lines changed: 4 additions & 0 deletions
@@ -918,6 +918,10 @@ def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
 
+    def get_vllm_config(self) -> VllmConfig:
+        """Gets the vllm configuration."""
+        return self.vllm_config
+
     def get_parallel_config(self) -> ParallelConfig:
         """Gets the parallel configuration."""
         return self.parallel_config
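Both engine classes now expose the same accessor. A hedged usage sketch for the synchronous engine follows; the model name and construction path are placeholders, not part of this diff.

    from vllm import EngineArgs, LLMEngine

    # Build a small engine and read back its full VllmConfig
    # through the accessor added in this commit.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    vllm_config = engine.get_vllm_config()
    print(vllm_config.model_config.model)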

vllm/engine/multiprocessing/client.py

Lines changed: 4 additions & 0 deletions
@@ -93,6 +93,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
         self._errored_with: Optional[BaseException] = None
 
         # Get the configs.
+        self.vllm_config = engine_config
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
@@ -383,6 +384,9 @@ async def get_decoding_config(self) -> DecodingConfig:
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def is_tracing_enabled(self) -> bool:
         return self.tracing_flag
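Because the client caches the whole engine_config at construction, get_vllm_config() resolves locally instead of making an IPC round-trip to the engine process. An illustrative stand-in for that pattern; the class and field names below are hypothetical, not vLLM's.

    from dataclasses import dataclass

    @dataclass
    class FakeModelConfig:
        model: str

    @dataclass
    class FakeVllmConfig:
        model_config: FakeModelConfig

    class ConfigCachingClient:
        """Stand-in for the caching pattern used by MQLLMEngineClient."""

        def __init__(self, engine_config: FakeVllmConfig):
            self.vllm_config = engine_config               # cached once at construction
            self.model_config = engine_config.model_config

        async def get_vllm_config(self) -> FakeVllmConfig:
            return self.vllm_config                        # local, no IPC round-trip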

vllm/entrypoints/openai/api_server.py

Lines changed: 8 additions & 25 deletions
@@ -2,7 +2,6 @@
 
 import asyncio
 import atexit
-import dataclasses
 import gc
 import importlib
 import inspect
@@ -105,20 +104,6 @@
 _running_tasks: set[asyncio.Task] = set()
 
 
-# Store global states
-@dataclasses.dataclass
-class _GlobalState:
-    vllmconfig: VllmConfig
-
-
-_global_state: Optional[_GlobalState] = None
-
-
-def set_global_state(global_state: _GlobalState):
-    global _global_state
-    _global_state = global_state
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
@@ -180,7 +165,6 @@ async def build_async_engine_client_from_engine_args(
     usage_context = UsageContext.OPENAI_API_SERVER
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
-    set_global_state(_GlobalState(vllmconfig=vllm_config))
     # V1 AsyncLLM.
     if envs.VLLM_USE_V1:
         if disable_frontend_multiprocessing:
@@ -744,13 +728,10 @@ async def is_sleeping(raw_request: Request):
         logger.info("check whether the engine is sleeping")
         is_sleeping = await engine_client(raw_request).is_sleeping()
         return JSONResponse(content={"is_sleeping": is_sleeping})
-
+
     @router.get("/server_info")
-    async def show_server_info():
-        if _global_state is None:
-            server_info = {"vllm_config": "Vllm Config not available"}
-        else:
-            server_info = {"vllm_config": str(_global_state.vllmconfig)}
+    async def show_server_info(raw_request: Request):
+        server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
         return JSONResponse(content=server_info)
 
 
@@ -919,7 +900,7 @@ async def log_response(request: Request, call_next):
 
 async def init_app_state(
     engine_client: EngineClient,
-    model_config: ModelConfig,
+    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
@@ -940,6 +921,8 @@
 
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
+    state.vllm_config = vllm_config
+    model_config = vllm_config.model_config
 
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
@@ -1094,8 +1077,8 @@ def signal_handler(*_) -> None:
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
-        model_config = await engine_client.get_model_config()
-        await init_app_state(engine_client, model_config, app.state, args)
+        vllm_config = await engine_client.get_vllm_config()
+        await init_app_state(engine_client, vllm_config, app.state, args)
 
         def _listen_addr(a: str) -> str:
             if is_valid_ipv6_address(a):
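The api_server change replaces a module-level global with FastAPI's per-application state: init_app_state stores the config on app.state, and the handler reads it back through raw_request.app.state. A self-contained sketch of that pattern in plain FastAPI; the stand-in config string is illustrative, not vLLM's.

    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse

    app = FastAPI()
    # In vLLM this assignment happens in init_app_state; here we stub the value.
    app.state.vllm_config = "model='facebook/opt-125m', dtype='float16'"

    @app.get("/server_info")
    async def show_server_info(raw_request: Request):
        # Read from app.state instead of a module-level global, so the
        # value is scoped to this application instance.
        return JSONResponse(
            content={"vllm_config": str(raw_request.app.state.vllm_config)})

    # Serve with e.g.: uvicorn <module>:app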

vllm/v1/engine/async_llm.py

Lines changed: 4 additions & 1 deletion
@@ -64,7 +64,7 @@ def __init__(
         assert start_engine_loop
 
         self.model_config = vllm_config.model_config
-
+        self.vllm_config = vllm_config
         self.log_requests = log_requests
         self.log_stats = log_stats
 
@@ -379,6 +379,9 @@ def encode(
     ):
         raise ValueError("Not Supported on V1 yet.")
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
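V1's AsyncLLM mirrors the V0 accessor. A hedged async usage sketch; the construction path and model name are placeholders, not part of this diff.

    import asyncio

    from vllm import AsyncEngineArgs
    from vllm.v1.engine.async_llm import AsyncLLM

    async def main() -> None:
        # Build a V1 async engine and read its full config via the new accessor.
        engine = AsyncLLM.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))
        vllm_config = await engine.get_vllm_config()
        print(vllm_config.model_config.model)

    asyncio.run(main())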
