From 82dbc0f3bceb4f22a8d9f32377a138bbffe8128e Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Wed, 25 Jun 2025 15:32:15 -0400 Subject: [PATCH 1/3] fix address already in use issue Signed-off-by: yewentao256 --- tests/kernels/moe/deepep_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py index 117f1babdf62..e7c844de3b1f 100644 --- a/tests/kernels/moe/deepep_utils.py +++ b/tests/kernels/moe/deepep_utils.py @@ -4,7 +4,9 @@ """ import dataclasses import importlib +import socket import traceback +from contextlib import closing from typing import Callable, Optional import torch @@ -79,6 +81,13 @@ def _worker_parallel_launch( torch.distributed.destroy_process_group() +def find_free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + def parallel_launch( world_size: int, worker: Callable[Concatenate[ProcessGroupInfo, P], None], @@ -92,7 +101,7 @@ def parallel_launch( world_size, world_size, 0, - "tcp://localhost:29500", + f"tcp://localhost:{find_free_port()}", worker, ) + args, nprocs=world_size, From 5c10f8c4a7278c4c35204689180f5e0c1ce8f3cb Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Wed, 25 Jun 2025 16:26:32 -0400 Subject: [PATCH 2/3] update through gemini Signed-off-by: yewentao256 --- tests/kernels/moe/deepep_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py index e7c844de3b1f..659a475f4e10 100644 --- a/tests/kernels/moe/deepep_utils.py +++ b/tests/kernels/moe/deepep_utils.py @@ -4,6 +4,7 @@ """ import dataclasses import importlib +import os import socket import traceback from contextlib import closing @@ -101,7 +102,7 @@ def parallel_launch( world_size, world_size, 0, - f"tcp://localhost:{find_free_port()}", + f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{find_free_port()}", worker, ) + args, nprocs=world_size, From 35d91eb715812941141f49cd66bc2cdfedd0537b Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Wed, 25 Jun 2025 17:11:02 -0400 Subject: [PATCH 3/3] move to utils Signed-off-by: yewentao256 --- tests/kernels/moe/deepep_utils.py | 11 ++--------- vllm/model_executor/layers/fused_moe/utils.py | 9 +++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py index 659a475f4e10..e4cd8386e102 100644 --- a/tests/kernels/moe/deepep_utils.py +++ b/tests/kernels/moe/deepep_utils.py @@ -5,9 +5,7 @@ import dataclasses import importlib import os -import socket import traceback -from contextlib import closing from typing import Callable, Optional import torch @@ -16,6 +14,8 @@ spawn) # pyright: ignore[reportPrivateImportUsage] from typing_extensions import Concatenate, ParamSpec +from vllm.model_executor.layers.fused_moe.utils import find_free_port + has_deep_ep = importlib.util.find_spec("deep_ep") is not None if has_deep_ep: from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 @@ -82,13 +82,6 @@ def _worker_parallel_launch( torch.distributed.destroy_process_group() -def find_free_port(): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return s.getsockname()[1] - - def parallel_launch( world_size: int, worker: Callable[Concatenate[ProcessGroupInfo, P], None], diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 692482c2ea69..8f3191db680f 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import socket +from contextlib import closing from math import prod from typing import Optional @@ -96,3 +98,10 @@ def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) else: return m[idx, ...] + + +def find_free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] \ No newline at end of file