From 4c5b50b48bb1c57c76288278e7a692664baaf69c Mon Sep 17 00:00:00 2001 From: Sylvain Didelot Date: Mon, 13 Nov 2023 14:11:54 +0000 Subject: [PATCH] prov/verbs: Async route resolution and non-blocking EP creation The patch implements the last changes to make the EP creation non-blocking. The address is now resolved asynchronously when the connection is established, so the caller is no longer blocked. The connection initiator creates the QP when the route is resolved. As for the connection target, it creates the QP when the EP is enabled (same as it was before). Signed-off-by: Sylvain Didelot --- prov/verbs/src/verbs_cm.c | 12 ++++++++---- prov/verbs/src/verbs_ep.c | 18 ++++++++++-------- prov/verbs/src/verbs_init.c | 20 -------------------- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/prov/verbs/src/verbs_cm.c b/prov/verbs/src/verbs_cm.c index fc6f62c3f72..037d2ee576f 100644 --- a/prov/verbs/src/verbs_cm.c +++ b/prov/verbs/src/verbs_cm.c @@ -190,11 +190,15 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); assert(ep->state == VRB_IDLE); - ep->state = VRB_RESOLVE_ROUTE; - ret = rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT); - if (ret) { + ep->state = VRB_RESOLVE_ADDR; + if (rdma_resolve_addr(ep->id, ep->info_attr.src_addr, + ep->info_attr.dest_addr, VERBS_RESOLVE_TIMEOUT)) { ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_route"); + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "src addr", ep->info_attr.src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "dst addr", ep->info_attr.dest_addr); free(ep->cm_priv_data); ep->cm_priv_data = NULL; ep->state = VRB_IDLE; diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index f710f0cf320..2a7fdf86a3c 100644 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -1033,15 +1033,17 @@ static int vrb_ep_enable(struct fid_ep 
*ep_fid) return -FI_EINVAL; } - ret = rdma_create_qp(ep->id, domain->pd, &attr); - if (ret) { - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); - return -errno; - } + if (ep->state == VRB_REQ_RCVD) { + ret = rdma_create_qp(ep->id, domain->pd, &attr); + if (ret) { + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); + return -errno; + } - /* Allow shared XRC INI QP not controlled by RDMA CM - * to share same post functions as RC QP. */ - ep->ibv_qp = ep->id->qp; + /* Allow shared XRC INI QP not controlled by RDMA CM + * to share same post functions as RC QP. */ + ep->ibv_qp = ep->id->qp; + } break; case FI_EP_DGRAM: assert(domain); diff --git a/prov/verbs/src/verbs_init.c b/prov/verbs/src/verbs_init.c index 5b1c6b765fd..a9fb6a629df 100644 --- a/prov/verbs/src/verbs_init.c +++ b/prov/verbs/src/verbs_init.c @@ -338,29 +338,9 @@ int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps, goto err1; } - /* TODO convert this call to non-blocking (use event channel) as well: - * This may likely be needed for better scaling when running large - * MPI jobs. - * Making this non-blocking would mean we can't create QP at EP enable - * time. We need to wait for RDMA_CM_EVENT_ADDR_RESOLVED event before - * creating the QP using rdma_create_qp. It would also require a SW - * receive queue to store recvs posted by app after enabling the EP. - */ - if (rdma_resolve_addr(*id, rai->ai_src_addr, rai->ai_dst_addr, - VERBS_RESOLVE_TIMEOUT)) { - ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "src addr", rai->ai_src_addr); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "dst addr", rai->ai_dst_addr); - goto err2; - } rdma_freeaddrinfo(rai); return 0; -err2: - rdma_destroy_id(*id); err1: rdma_freeaddrinfo(rai); return ret;