diff --git a/README.cvars b/README.cvars index 23c3f66f0..66ac6af1b 100644 --- a/README.cvars +++ b/README.cvars @@ -117,6 +117,13 @@ Description: Type: stringlist Default: None +NCCL_CTRAN_IB_TRAFFIC_PROFILNG +Description: + Enable IB transport traffic profiling. + Disabled by default. +Type: bool +Default: False + NCCL_CTRAN_IB_MAX_QPS Description: Maximum number of QPs to enable, so data can be split across diff --git a/src/ctran/backends/ib/ctranIb.cc b/src/ctran/backends/ib/ctranIb.cc index 7ac2fa925..70dc8a2d3 100644 --- a/src/ctran/backends/ib/ctranIb.cc +++ b/src/ctran/backends/ib/ctranIb.cc @@ -22,6 +22,13 @@ List of IB HCAs available for NCCL to use. (this needs to be renamed to NCCL_IB_HCA_LIST eventually) + - name : NCCL_CTRAN_IB_TRAFFIC_PROFILNG + type : bool + default : false + description : |- + Enable IB transport traffic profiling. + Disabled by default. + === END_NCCL_CVAR_INFO_BLOCK === */ @@ -107,6 +114,41 @@ ctranIbSingleton::~ctranIbSingleton() { for (auto context : this->contexts) { NCCLCHECKIGNORE(wrap_ibv_close_device(context)); } + + if (!NCCL_CTRAN_IB_TRAFFIC_PROFILNG) + return; + + this->trafficRecordMutex.lock(); + for (auto& it : this->trafficPerDevice) { + INFO(NCCL_INIT, "CTRAN-IB: [traffic profiling] device %s total traffic: %ld bytes", it.first.c_str(), it.second); + } + for (auto& it : this->trafficPerQP) { + INFO(NCCL_INIT, "CTRAN-IB: [traffic profiling] qp %d total traffic: %ld bytes", it.first, it.second); + } +} + +void ctranIbSingleton::recordCtxTraffic( + struct ibv_context* ctx, + size_t nbytes) { + if (!NCCL_CTRAN_IB_TRAFFIC_PROFILNG) + return; + std::lock_guard guard(this->trafficRecordMutex); + auto devName = std::string(ctx->device->name); + + if (this->trafficPerDevice.count(devName) == 0) { + this->trafficPerDevice[devName] = 0; + } + this->trafficPerDevice[devName] += nbytes; +} + +void ctranIbSingleton::recordQpTraffic(struct ibv_qp* qp, size_t nbytes) { + if (!NCCL_CTRAN_IB_TRAFFIC_PROFILNG) + return; + std::lock_guard guard(this->trafficRecordMutex); + if (this->trafficPerQP.count(qp->qp_num) == 0) { + this->trafficPerQP[qp->qp_num] = 0; + } + this->trafficPerQP[qp->qp_num] += nbytes; } ctranIb::ctranIb(ncclComm *comm) { @@ -120,7 +162,7 @@ ctranIb::ctranIb(ncclComm *comm) { this->pimpl->context = s.contexts[comm->cudaDev]; this->pimpl->pd = s.pds[comm->cudaDev]; this->pimpl->port = s.ports[comm->cudaDev]; - INFO(NCCL_INIT, "CTRAN-IB: using device %s, port %d", s.devNames[comm->cudaDev].c_str(), this->pimpl->port); + INFO(NCCL_INIT, "CTRAN-IB: using device %s, port %d commHash %lu", s.devNames[comm->cudaDev].c_str(), this->pimpl->port, comm->commHash); struct ibv_device_attr devAttr; NCCLCHECKIGNORE(wrap_ibv_query_device(this->pimpl->context, &devAttr)); diff --git a/src/ctran/backends/ib/ctranIbImpl.h b/src/ctran/backends/ib/ctranIbImpl.h index eeb60b3ec..f44a8929a 100644 --- a/src/ctran/backends/ib/ctranIbImpl.h +++ b/src/ctran/backends/ib/ctranIbImpl.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "ibvwrap.h" #include "bootstrap.h" @@ -42,8 +43,13 @@ class ctranIbSingleton { std::vector contexts; std::vector pds; std::vector devNames; + void recordCtxTraffic(struct ibv_context *ctx, size_t nbytes); + void recordQpTraffic(struct ibv_qp* qp, size_t nbytes); private: + std::unordered_map trafficPerDevice; + std::unordered_map trafficPerQP; + std::mutex trafficRecordMutex; ctranIbSingleton(); ~ctranIbSingleton(); }; diff --git a/src/ctran/backends/ib/ctranIbVc.cc b/src/ctran/backends/ib/ctranIbVc.cc index ba85df178..8a23999d1 100644 --- a/src/ctran/backends/ib/ctranIbVc.cc +++ b/src/ctran/backends/ib/ctranIbVc.cc @@ -1,6 +1,8 @@ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. #include +#include +#include #include #include #include @@ -343,6 +345,8 @@ ncclResult_t ctranIb::impl::vc::postPutMsg(const void *sbuf, void *dbuf, std::si } uint64_t offset = 0; + ctranIbSingleton& s = ctranIbSingleton::getInstance(); + s.recordCtxTraffic(this->context, len_); for (int i = 0; i < numQps; i++) { uint64_t len = len_ / numQps; @@ -386,6 +390,7 @@ ncclResult_t ctranIb::impl::vc::postPutMsg(const void *sbuf, void *dbuf, std::si wr.wr.rdma.remote_addr = reinterpret_cast(dbuf) + offset; wr.wr.rdma.rkey = rkey; + s.recordQpTraffic(this->dataQp[i], toSend); NCCLCHECKGOTO(wrap_ibv_post_send(this->dataQp[i], &wr, &badWr), res, exit); len -= toSend; diff --git a/src/include/nccl_cvars.h b/src/include/nccl_cvars.h index 28d408367..c39d23ff4 100644 --- a/src/include/nccl_cvars.h +++ b/src/include/nccl_cvars.h @@ -54,6 +54,8 @@ extern enum NCCL_SENDRECV_ALGO NCCL_SENDRECV_ALGO; extern std::set NCCL_IB_HCA; +extern bool NCCL_CTRAN_IB_TRAFFIC_PROFILNG; + extern int NCCL_CTRAN_IB_MAX_QPS; extern int NCCL_CTRAN_IB_QP_SCALING_THRESHOLD; diff --git a/src/misc/nccl_cvars.cc b/src/misc/nccl_cvars.cc index 6c428cf53..41c67715a 100644 --- a/src/misc/nccl_cvars.cc +++ b/src/misc/nccl_cvars.cc @@ -93,6 +93,8 @@ enum NCCL_SENDRECV_ALGO NCCL_SENDRECV_ALGO; std::set NCCL_IB_HCA; +bool NCCL_CTRAN_IB_TRAFFIC_PROFILNG; + int NCCL_CTRAN_IB_MAX_QPS; int NCCL_CTRAN_IB_QP_SCALING_THRESHOLD; @@ -126,6 +128,7 @@ void ncclCvarInit() { env.insert("NCCL_DDA_FORCE_P2P_ACCESS"); env.insert("NCCL_SENDRECV_ALGO"); env.insert("NCCL_IB_HCA"); + env.insert("NCCL_CTRAN_IB_TRAFFIC_PROFILNG"); env.insert("NCCL_CTRAN_IB_MAX_QPS"); env.insert("NCCL_CTRAN_IB_QP_SCALING_THRESHOLD"); env.insert("NCCL_CTRAN_PROFILING"); @@ -239,6 +242,8 @@ void ncclCvarInit() { NCCL_IB_HCA = env2strlist("NCCL_IB_HCA", nullptr); + NCCL_CTRAN_IB_TRAFFIC_PROFILNG = env2bool("NCCL_CTRAN_IB_TRAFFIC_PROFILNG", "False"); + NCCL_CTRAN_IB_MAX_QPS = env2int("NCCL_CTRAN_IB_MAX_QPS", "1"); NCCL_CTRAN_IB_QP_SCALING_THRESHOLD = env2int("NCCL_CTRAN_IB_QP_SCALING_THRESHOLD", "1048576");