@@ -104,8 +104,12 @@ def _init_executor(self) -> None:
104104 finally :
105105 if not success :
106106 # Clean up the worker procs if there was a failure.
107+ # Close death_writers first to signal workers to exit
108+ for uw in unready_workers :
109+ if uw .death_writer is not None :
110+ uw .death_writer .close ()
107111 self ._ensure_worker_termination (
108- [w .proc for w in unready_workers ])
112+ [uw .proc for uw in unready_workers ])
109113
110114 # For pipeline parallel, we use a thread pool for asynchronous
111115 # execute_model.
@@ -282,6 +286,10 @@ def shutdown(self):
282286
283287 if workers := getattr (self , 'workers' , None ):
284288 for w in workers :
289+ # Close death_writer to signal child processes to exit
290+ if w .death_writer is not None :
291+ w .death_writer .close ()
292+ w .death_writer = None
285293 w .worker_response_mq = None
286294 self ._ensure_worker_termination ([w .proc for w in workers ])
287295
@@ -316,13 +324,15 @@ class UnreadyWorkerProcHandle:
316324 proc : BaseProcess
317325 rank : int
318326 ready_pipe : Connection
327+ death_writer : Optional [Connection ] = None
319328
320329
321330@dataclass
322331class WorkerProcHandle :
323332 proc : BaseProcess
324333 rank : int
325334 worker_response_mq : MessageQueue # The worker process writes to this MQ
335+ death_writer : Optional [Connection ] = None
326336
327337 @classmethod
328338 def from_unready_handle (
@@ -332,6 +342,7 @@ def from_unready_handle(
332342 proc = unready_handle .proc ,
333343 rank = unready_handle .rank ,
334344 worker_response_mq = worker_response_mq ,
345+ death_writer = unready_handle .death_writer ,
335346 )
336347
337348
@@ -396,13 +407,17 @@ def make_worker_process(
396407 # (reader, writer)
397408 reader , writer = context .Pipe (duplex = False )
398409
410+ # Create death pipe to detect parent process exit
411+ death_reader , death_writer = context .Pipe (duplex = False )
412+
399413 process_kwargs = {
400414 "vllm_config" : vllm_config ,
401415 "local_rank" : local_rank ,
402416 "rank" : rank ,
403417 "distributed_init_method" : distributed_init_method ,
404418 "input_shm_handle" : input_shm_handle ,
405419 "ready_pipe" : (reader , writer ),
420+ "death_pipe" : death_reader ,
406421 }
407422 # Run EngineCore busy loop in background process.
408423 proc = context .Process (target = WorkerProc .worker_main ,
@@ -412,7 +427,9 @@ def make_worker_process(
412427
413428 proc .start ()
414429 writer .close ()
415- return UnreadyWorkerProcHandle (proc , rank , reader )
430+ # Keep death_writer open in the parent: once the parent exits,
431+ # recv() on the child's death_reader raises EOFError
432+ return UnreadyWorkerProcHandle (proc , rank , reader , death_writer )
416433
417434 @staticmethod
418435 def wait_for_ready (
@@ -483,6 +500,28 @@ def signal_handler(signum, frame):
483500 worker = None
484501 # tuple[Connection, Connection]
485502 reader , ready_writer = kwargs .pop ("ready_pipe" )
503+ death_pipe = kwargs .pop ("death_pipe" , None )
504+
505+ # Start death monitoring thread if death_pipe is provided
506+ if death_pipe is not None :
507+
508+ def monitor_parent_death ():
509+ try :
510+ # Blocks until the parent exits or closes death_writer (EOF)
511+ death_pipe .recv ()
512+ except EOFError :
513+ # Parent exited or closed death_writer; terminate this worker
514+ logger .info ("Parent process exited, terminating worker" )
515+ # Send signal to self to trigger clean shutdown
516+ os .kill (os .getpid (), signal .SIGTERM )
517+ except Exception as e :
518+ logger .warning ("Death monitoring error: %s" , e )
519+
520+ death_monitor = Thread (target = monitor_parent_death ,
521+ daemon = True ,
522+ name = "WorkerDeathMonitor" )
523+ death_monitor .start ()
524+
486525 try :
487526 reader .close ()
488527 worker = WorkerProc (* args , ** kwargs )
@@ -523,6 +562,8 @@ def signal_handler(signum, frame):
523562 finally :
524563 if ready_writer is not None :
525564 ready_writer .close ()
565+ if death_pipe is not None :
566+ death_pipe .close ()
526567 # Clean up once worker exits busy loop
527568 if worker is not None :
528569 worker .shutdown ()
0 commit comments