From d70a3541970a004a13306b83c4ea86e23724ece8 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Wed, 2 Apr 2025 23:22:48 -0400
Subject: [PATCH 1/5] Use custom address for listening socket

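This allows the TCPStore master (rank 0) to listen on the specified
host interface instead of 0.0.0.0, which may save OS resources.

The listening socket is created and bound to (host, port) in Python and
its file descriptor is handed to TCPStore via master_listen_fd. A
reference to the socket is kept on the StatelessProcessGroup so that
the file descriptor stays alive.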

Signed-off-by: Jens Glaser <glaserj@ornl.gov>
---
 vllm/distributed/utils.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index cae1a25519b3..74d90e356ef4 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -8,6 +8,7 @@
 import datetime
 import pickle
 import time
+import socket
 from collections import deque
 from typing import Any, Deque, Dict, Optional, Sequence, Tuple
 
@@ -122,6 +123,7 @@ class StatelessProcessGroup:
     rank: int
     world_size: int
     store: torch._C._distributed_c10d.Store
+    socket: socket.socket # so that file descriptor remains available
     data_expiration_seconds: int = 3600  # 1 hour
 
     # dst rank -> counter
@@ -233,18 +235,33 @@ def create(
         can call `StatelessProcessGroup.create` to form a group, and then process A, B,
         C, and D can call `StatelessProcessGroup.create` to form another group.
         """ # noqa
+        launch_server = rank == 0
+        if launch_server:
+            # listen on the specified interface (instead of 0.0.0.0)
+            listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            listen_socket.bind((host, port))
+            listen_socket.listen()
+            listen_fd = listen_socket.fileno()
+        else:
+            listen_socket = None
+            listen_fd = None
+
         store = TCPStore(
             host_name=host,
             port=port,
             world_size=world_size,
-            is_master=(rank == 0),
+            is_master=launch_server,
             timeout=datetime.timedelta(seconds=store_timeout),
+            use_libuv=False, # could be enabled with recent torch versions, see https://github.com/pytorch/pytorch/pull/150215
+            master_listen_fd=listen_fd,
         )
 
         return StatelessProcessGroup(
             rank=rank,
             world_size=world_size,
             store=store,
+            socket=listen_socket,
             data_expiration_seconds=data_expiration_seconds)
 
 

From 08fe582a761469b0f3be14c90bc680646e45dfd5 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Thu, 3 Apr 2025 00:17:59 -0400
Subject: [PATCH 2/5] Fix typing error

Signed-off-by: Jens Glaser <glaserj@ornl.gov>
---
 vllm/distributed/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 74d90e356ef4..f33ed8f7b7cb 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -123,7 +123,10 @@ class StatelessProcessGroup:
     rank: int
     world_size: int
     store: torch._C._distributed_c10d.Store
-    socket: socket.socket # so that file descriptor remains available
+
+    # stores a reference to the socket so that the file descriptor remains available
+    socket: Optional[socket.socket]
+
     data_expiration_seconds: int = 3600  # 1 hour
 
     # dst rank -> counter

From 5d024b14659ba2190bb3657fe7889304e7d3eb89 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Thu, 3 Apr 2025 11:41:52 -0400
Subject: [PATCH 3/5] Fix too long line

Signed-off-by: Jens Glaser <glaserj@ornl.gov>
---
 vllm/distributed/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index f33ed8f7b7cb..ee24f4ee5474 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -124,7 +124,7 @@ class StatelessProcessGroup:
     world_size: int
     store: torch._C._distributed_c10d.Store
 
-    # stores a reference to the socket so that the file descriptor remains available
+    # stores a reference to the socket so that the file descriptor stays alive
     socket: Optional[socket.socket]
 
     data_expiration_seconds: int = 3600  # 1 hour

From 73ad6ca1fc9f9261da61d941a606aab3ed5f2eee Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Thu, 3 Apr 2025 11:49:58 -0400
Subject: [PATCH 4/5] fix another long comment

Signed-off-by: Jens Glaser <glaserj@ornl.gov>
---
 vllm/distributed/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index ee24f4ee5474..0d51c013e507 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -256,7 +256,7 @@ def create(
             world_size=world_size,
             is_master=launch_server,
             timeout=datetime.timedelta(seconds=store_timeout),
-            use_libuv=False, # could be enabled with recent torch versions, see https://github.com/pytorch/pytorch/pull/150215
+            use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215
             master_listen_fd=listen_fd,
         )
 

From eb02f3729b5433fc259e52e5e13d6605a2b7a870 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Thu, 3 Apr 2025 11:58:24 -0400
Subject: [PATCH 5/5] fix remaining pre-commit errors

Signed-off-by: Jens Glaser <glaserj@ornl.gov>
---
 vllm/distributed/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 0d51c013e507..aad854e914b3 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -7,8 +7,8 @@
 import dataclasses
 import datetime
 import pickle
-import time
 import socket
+import time
 from collections import deque
 from typing import Any, Deque, Dict, Optional, Sequence, Tuple
 
@@ -256,7 +256,7 @@ def create(
             world_size=world_size,
             is_master=launch_server,
             timeout=datetime.timedelta(seconds=store_timeout),
-            use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215
+            use_libuv=False,  # for now: github.com/pytorch/pytorch/pull/150215
             master_listen_fd=listen_fd,
         )