Skip to content

Commit 727bd39

Browse files
tianhaodongbd authored and Your Name committed
[Distributed] fix recreate nccl comm bug (#73625)
1 parent d5b8b45 commit 727bd39

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

paddle/fluid/distributed/collective/process_group_nccl.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -991,8 +991,8 @@ void ProcessGroupNCCL::Restart() {
991991
phi::distributed::P2POption p2p_opts = place_to_p2p_opts_.at(place_key);
992992
phi::distributed::CommContextManager::RecreateNCCLComm(
993993
store_, store_key, rank_, std::to_string(create_count_), &p2p_opts);
994-
create_count_++;
995994
}
995+
create_count_++;
996996
}
997997
phi::CUDAStream ProcessGroupNCCL::GetStream(const Place& place) {
998998
const auto& place_key = GetKeyFromPlace(place);

paddle/fluid/distributed/collective/process_group_nccl.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include <chrono>
18+
#include <map>
1819
#include <memory>
1920
#include <string>
2021
#include <unordered_map>
@@ -285,7 +286,7 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
285286

286287
uint64_t comm_seq_{0};
287288
std::unordered_map<std::string, uint64_t> p2p_comm_seq_;
288-
std::unordered_map<std::string, std::string> place_to_group_key_;
289+
std::map<std::string, std::string> place_to_group_key_;
289290

290291
// TODO(sunyilun): attrs below will be removed later
291292
std::mutex mutex_;

paddle/phi/core/distributed/comm_context_manager.cc

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ void CommContextManager::CreateNCCLCommContext(
131131
void CommContextManager::RecreateNCCLComm(const std::shared_ptr<Store>& store,
132132
const std::string& unique_comm_key,
133133
int rank,
134-
const std::string& hash_key,
134+
const std::string& recreate_key,
135135
const P2POption* p2p_opt) {
136136
auto& comm_context_manager = CommContextManager::GetInstance();
137137

@@ -140,7 +140,8 @@ void CommContextManager::RecreateNCCLComm(const std::shared_ptr<Store>& store,
140140
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
141141
}
142142

143-
std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key;
143+
std::string unique_key =
144+
"NCCLCommContext/" + unique_comm_key + "/" + recreate_key;
144145
if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) {
145146
std::vector<uint8_t> nccl_id_wrapper(
146147
reinterpret_cast<uint8_t*>(&nccl_id),

0 commit comments

Comments
 (0)