Skip to content

Commit 77ec968

Browse files
JKSenthil authored and
facebook-github-bot committed
fix timeout error on TCPStore creation (#147)
Summary: Pull Request resolved: #147. Async torchsnapshotting was causing a timeout error when creating the TCPStore within PendingSnapshot on an ondemand. Using `get_socket_with_port()` from `torch.distributed.elastic.utils.distributed` resolves the issue. Reviewed By: daniellepintz. Differential Revision: D48072665. fbshipit-source-id: a573a146f33ecec5f91ed800984e9b0f95cb2974
1 parent 71965ba commit 77ec968

File tree

1 file changed

+3
-5
lines changed

1 file changed

+3
-5
lines changed

torchsnapshot/dist_store.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
import socket
98
from datetime import timedelta
109
from typing import Dict, Optional
1110

1211
import torch.distributed as dist
12+
from torch.distributed.elastic.utils.distributed import get_socket_with_port
1313

1414
from .pg_wrapper import PGWrapper
1515

@@ -63,10 +63,8 @@ def create_store(pg_wrapper: PGWrapper) -> dist.Store:
6363
"""
6464
if pg_wrapper.get_rank() == 0:
6565
# Find a free port
66-
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
67-
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
68-
sock.bind((socket.gethostname(), 0))
69-
master_addr, master_port = sock.getsockname()
66+
sock = get_socket_with_port()
67+
master_addr, master_port, _, _ = sock.getsockname()
7068
sock.close()
7169
# Broadcast master address/port to peers
7270
obj_list = [master_addr, master_port]

0 commit comments

Comments
 (0)