perf: move vLLM engine initialization from __init__ to an @enter() hook

OpenRouterTeam · Jan 11, 2024 · 0c3a991 · 0c3a991
1 parent 40bcca8
commit 0c3a991
Showing 1 changed file with 10 additions and 3 deletions.
diff --git a/modal/runner/engines/vllm.py b/modal/runner/engines/vllm.py
@@ -1,6 +1,6 @@
 from typing import Optional
 
-from modal import method
+from modal import enter, method
 from pydantic import BaseModel
 
 from shared.protocol import (
@@ -44,12 +44,17 @@ def __init__(self, params: VllmParams):
         from vllm.engine.arg_utils import AsyncEngineArgs
         from vllm.engine.async_llm_engine import AsyncLLMEngine
 
-        engine_args = AsyncEngineArgs(
+        self.engine_args = AsyncEngineArgs(
             **params.dict(),
             disable_log_requests=True,
         )
+        self.engine: AsyncLLMEngine | None = None
 
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+    @enter()
+    def start(self):
+        from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+        self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
 
     # @method()
     # async def tokenize_prompt(self, payload: Payload) -> List[int]:
@@ -62,6 +67,8 @@ def __init__(self, params: VllmParams):
 
     @method()
     async def generate(self, payload: CompletionPayload, params):
+        assert self.engine is not None, "Engine not initialized"
+
         try:
             import time