11# SPDX-License-Identifier: Apache-2.0
22
3- import pickle
43import queue
54import signal
65import threading
76import time
87from multiprocessing .connection import Connection
9- from typing import List , Tuple , Type
8+ from typing import Any , List , Tuple , Type
109
1110import psutil
1211import zmq
1918from vllm .utils import get_exception_traceback , zmq_socket_ctx
2019from vllm .v1 .core .kv_cache_utils import get_kv_cache_config
2120from vllm .v1 .core .scheduler import Scheduler
22- from vllm .v1 .engine import (EngineCoreOutputs , EngineCoreProfile ,
23- EngineCoreRequest , EngineCoreRequestType ,
24- EngineCoreRequestUnion , EngineCoreResetPrefixCache )
21+ from vllm .v1 .engine import (EngineCoreOutputs , EngineCoreRequest ,
22+ EngineCoreRequestType )
2523from vllm .v1 .engine .mm_input_mapper import MMInputMapperServer
2624from vllm .v1 .executor .abstract import Executor
2725from vllm .v1 .request import Request , RequestStatus
28- from vllm .v1 .serial_utils import MsgpackEncoder , PickleEncoder
26+ from vllm .v1 .serial_utils import MsgpackDecoder , MsgpackEncoder
2927from vllm .version import __version__ as VLLM_VERSION
3028
3129logger = init_logger (__name__ )
@@ -171,7 +169,8 @@ def __init__(
171169 # and to overlap some serialization/deserialization with the
172170 # model forward pass.
173171 # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
174- self .input_queue : queue .Queue [EngineCoreRequestUnion ] = queue .Queue ()
172+ self .input_queue : queue .Queue [Tuple [EngineCoreRequestType ,
173+ Any ]] = queue .Queue ()
175174 self .output_queue : queue .Queue [EngineCoreOutputs ] = queue .Queue ()
176175 threading .Thread (target = self .process_input_socket ,
177176 args = (input_path , ),
@@ -233,7 +232,7 @@ def run_busy_loop(self):
233232 while True :
234233 try :
235234 req = self .input_queue .get (timeout = POLLING_TIMEOUT_S )
236- self ._handle_client_request (req )
235+ self ._handle_client_request (* req )
237236 break
238237 except queue .Empty :
239238 logger .debug ("EngineCore busy loop waiting." )
@@ -243,59 +242,51 @@ def run_busy_loop(self):
243242 except BaseException :
244243 raise
245244
246- # 2) Handle any new client requests (Abort or Add) .
245+ # 2) Handle any new client requests.
247246 while not self .input_queue .empty ():
248247 req = self .input_queue .get_nowait ()
249- self ._handle_client_request (req )
248+ self ._handle_client_request (* req )
250249
251250 # 3) Step the engine core.
252251 outputs = self .step ()
253252
254253 # 4) Put EngineCoreOutputs into the output queue.
255254 self .output_queue .put_nowait (outputs )
256255
257- def _handle_client_request (self , request : EngineCoreRequestUnion ) -> None :
258- """Handle EngineCoreRequest or EngineCoreABORT from Client."""
256+ def _handle_client_request (self , request_type : EngineCoreRequestType ,
257+ request : Any ) -> None :
258+ """Dispatch request from client."""
259259
260- if isinstance ( request , EngineCoreRequest ) :
260+ if request_type == EngineCoreRequestType . ADD :
261261 self .add_request (request )
262- elif isinstance (request , EngineCoreProfile ):
263- self .model_executor .profile (request .is_start )
264- elif isinstance (request , EngineCoreResetPrefixCache ):
265- self .reset_prefix_cache ()
266- else :
267- # TODO: make an EngineCoreAbort wrapper
268- assert isinstance (request , list )
262+ elif request_type == EngineCoreRequestType .ABORT :
269263 self .abort_requests (request )
264+ elif request_type == EngineCoreRequestType .RESET_PREFIX_CACHE :
265+ self .reset_prefix_cache ()
266+ elif request_type == EngineCoreRequestType .PROFILE :
267+ self .model_executor .profile (request )
270268
271269 def process_input_socket (self , input_path : str ):
272270 """Input socket IO thread."""
273271
274272 # Msgpack serialization decoding.
275- decoder_add_req = PickleEncoder ( )
276- decoder_abort_req = PickleEncoder ()
273+ add_request_decoder = MsgpackDecoder ( EngineCoreRequest )
274+ generic_decoder = MsgpackDecoder ()
277275
278276 with zmq_socket_ctx (input_path , zmq .constants .PULL ) as socket :
279277 while True :
280278 # (RequestType, RequestData)
281279 type_frame , data_frame = socket .recv_multipart (copy = False )
282- request_type = type_frame .buffer
283- request_data = data_frame .buffer
280+ request_type = EngineCoreRequestType (bytes (type_frame .buffer ))
284281
285282 # Deserialize the request data.
286- if request_type == EngineCoreRequestType .ADD .value :
287- request = decoder_add_req .decode (request_data )
288- elif request_type == EngineCoreRequestType .ABORT .value :
289- request = decoder_abort_req .decode (request_data )
290- elif request_type in (
291- EngineCoreRequestType .PROFILE .value ,
292- EngineCoreRequestType .RESET_PREFIX_CACHE .value ):
293- request = pickle .loads (request_data )
294- else :
295- raise ValueError (f"Unknown RequestType: { request_type } " )
283+ decoder = add_request_decoder if (
284+ request_type
285+ == EngineCoreRequestType .ADD ) else generic_decoder
286+ request = decoder .decode (data_frame .buffer )
296287
297288 # Push to input queue for core busy loop.
298- self .input_queue .put_nowait (request )
289+ self .input_queue .put_nowait (( request_type , request ) )
299290
300291 def process_output_socket (self , output_path : str ):
301292 """Output socket IO thread."""
0 commit comments