From d70a3541970a004a13306b83c4ea86e23724ece8 Mon Sep 17 00:00:00 2001 From: Jens Glaser Date: Wed, 2 Apr 2025 23:22:48 -0400 Subject: [PATCH 1/5] Use custom address for listening socket This allows listening on a specified interface instead of 0.0.0.0, which may save OS resources Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index cae1a25519b3..74d90e356ef4 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -8,6 +8,7 @@ import datetime import pickle import time +import socket from collections import deque from typing import Any, Deque, Dict, Optional, Sequence, Tuple @@ -122,6 +123,7 @@ class StatelessProcessGroup: rank: int world_size: int store: torch._C._distributed_c10d.Store + socket: socket.socket # so that file descriptor remains available data_expiration_seconds: int = 3600 # 1 hour # dst rank -> counter @@ -233,18 +235,33 @@ def create( can call `StatelessProcessGroup.create` to form a group, and then process A, B, C, and D can call `StatelessProcessGroup.create` to form another group. """ # noqa + launch_server = rank == 0 + if launch_server: + # listen on the specified interface (instead of 0.0.0.0) + listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + listen_socket.bind((host, port)) + listen_socket.listen() + listen_fd = listen_socket.fileno() + else: + listen_socket = None + listen_fd = None + store = TCPStore( host_name=host, port=port, world_size=world_size, - is_master=(rank == 0), + is_master=launch_server, timeout=datetime.timedelta(seconds=store_timeout), + use_libuv=False, # could be enabled with recent torch versions, see https://github.com/pytorch/pytorch/pull/150215 + master_listen_fd=listen_fd, ) return StatelessProcessGroup( rank=rank, world_size=world_size, store=store, + socket=listen_socket, data_expiration_seconds=data_expiration_seconds) From 08fe582a761469b0f3be14c90bc680646e45dfd5 Mon Sep 17 00:00:00 2001 From: Jens Glaser Date: Thu, 3 Apr 2025 00:17:59 -0400 Subject: [PATCH 2/5] Fix typing error Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 74d90e356ef4..f33ed8f7b7cb 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -123,7 +123,10 @@ class StatelessProcessGroup: rank: int world_size: int store: torch._C._distributed_c10d.Store - socket: socket.socket # so that file descriptor remains available + + # stores a reference to the socket so that the file descriptor remains available + socket: Optional[socket.socket] + data_expiration_seconds: int = 3600 # 1 hour # dst rank -> counter From 5d024b14659ba2190bb3657fe7889304e7d3eb89 Mon Sep 17 00:00:00 2001 From: Jens Glaser Date: Thu, 3 Apr 2025 11:41:52 -0400 Subject: [PATCH 3/5] Fix too long line Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index f33ed8f7b7cb..ee24f4ee5474 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -124,7 +124,7 @@ class StatelessProcessGroup: world_size: int store: torch._C._distributed_c10d.Store - # stores a reference to the socket so that the file descriptor remains available + # stores a reference to the socket so that the file descriptor stays alive socket: Optional[socket.socket] data_expiration_seconds: int = 3600 # 1 hour From 73ad6ca1fc9f9261da61d941a606aab3ed5f2eee Mon Sep 17 00:00:00 2001 From: Jens Glaser Date: Thu, 3 Apr 2025 11:49:58 -0400 Subject: [PATCH 4/5] fix another long comment Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index ee24f4ee5474..0d51c013e507 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -256,7 +256,7 @@ def create( world_size=world_size, is_master=launch_server, timeout=datetime.timedelta(seconds=store_timeout), - use_libuv=False, # could be enabled with recent torch versions, see https://github.com/pytorch/pytorch/pull/150215 + use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 master_listen_fd=listen_fd, ) From eb02f3729b5433fc259e52e5e13d6605a2b7a870 Mon Sep 17 00:00:00 2001 From: Jens Glaser Date: Thu, 3 Apr 2025 11:58:24 -0400 Subject: [PATCH 5/5] fix remaining pre-commit errors Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 0d51c013e507..aad854e914b3 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -7,8 +7,8 @@ import dataclasses import datetime import pickle -import time import socket +import time from collections import deque from typing import Any, Deque, Dict, Optional, Sequence, Tuple @@ -256,7 +256,7 @@ def create( world_size=world_size, is_master=launch_server, timeout=datetime.timedelta(seconds=store_timeout), - use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 + use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 master_listen_fd=listen_fd, )