diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 704b6f45e30..328fd932d5c 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -139,9 +139,25 @@ static int efa_domain_init_qp_table(struct efa_domain *efa_domain) static int efa_domain_init_rdm(struct efa_domain *efa_domain, struct fi_info *info) { int err; + bool enable_shm = efa_env.enable_shm_transfer; + + /* App provided hints supercede environmental variables. + * + * Using the shm provider comes with some overheads, so avoid + * initializing the provider if the app provides a hint that it does not + * require node-local communication. We can still loopback over the EFA + * device in cases where the app violates the hint and continues + * communicating with node-local peers. + * + */ + if ((info->caps & FI_REMOTE_COMM) + /* but not local communication */ + && !(info->caps & FI_LOCAL_COMM)) { + enable_shm = false; + } efa_domain->shm_info = NULL; - if (efa_env.enable_shm_transfer) + if (enable_shm) efa_shm_info_create(info, &efa_domain->shm_info); else EFA_INFO(FI_LOG_CORE, "EFA will not use SHM for intranode communication because FI_EFA_ENABLE_SHM_TRANSFER=0\n"); diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index f0fe471e6b1..f1028cd9d35 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -236,7 +236,7 @@ efa_rdm_atomic_inject(struct fid_ep *ep, return err; peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (!(efa_rdm_ep_domain(efa_rdm_ep)->shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) remote_addr = 0; @@ -293,7 +293,7 @@ efa_rdm_atomic_writemsg(struct fid_ep *ep, peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { efa_rdm_atomic_init_shm_msg(efa_rdm_ep, &shm_msg, msg, rma_iov, shm_desc); shm_msg.addr = peer->shm_fiaddr; return fi_atomicmsg(efa_rdm_ep->shm_ep, &shm_msg, flags); @@ -386,7 +386,7 @@ efa_rdm_atomic_readwritemsg(struct fid_ep *ep, peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); - if (peer->is_local & efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { efa_rdm_atomic_init_shm_msg(efa_rdm_ep, &shm_msg, msg, shm_rma_iov, shm_desc); shm_msg.addr = peer->shm_fiaddr; efa_rdm_get_desc_for_shm(result_count, result_desc, shm_res_desc); @@ -498,7 +498,7 @@ efa_rdm_atomic_compwritemsg(struct fid_ep *ep, peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { efa_rdm_atomic_init_shm_msg(efa_rdm_ep, &shm_msg, msg, shm_rma_iov, shm_desc); shm_msg.addr = peer->shm_fiaddr; efa_rdm_get_desc_for_shm(result_count, result_desc, shm_res_desc); diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 0dc70b2fe31..5578d9d2ab8 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -94,13 +94,19 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun ofi_genlock_lock(srx_ctx->lock); - if (cq->shm_cq) + if (cq->shm_cq) { fi_cq_read(cq->shm_cq, NULL, 0); - ret = ofi_cq_read_entries(&cq->util_cq, buf, count, src_addr); + /* + * fi_cq_read(cq->shm_cq, NULL, 0) will progress shm ep and write + * completion to efa. Use ofi_cq_read_entries to get the number of + * shm completions without progressing efa ep again. + */ + ret = ofi_cq_read_entries(&cq->util_cq, buf, count, src_addr); - if (ret > 0) - goto out; + if (ret > 0) + goto out; + } ret = ofi_cq_readfrom(&cq->util_cq.cq_fid, buf, count, src_addr); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 4705426cc63..6aa9328010f 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -85,7 +85,6 @@ struct efa_rdm_ep { enum ibv_cq_ex_type ibv_cq_ex_type; /* shm provider fid */ - bool use_shm_for_tx; struct fid_ep *shm_ep; /* diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 43f08ac548a..2d6f35f26e9 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -888,41 +888,97 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) } /** - * @brief set the "use_shm_for_tx" field of efa_rdm_ep - * The field is set based on various factors, including - * environment variables, user hints, user's fi_setopt() - * calls. - * This function should be called during call to fi_enable(), - * after user called fi_setopt(). - * - * @param[in,out] ep endpoint to set the field + * @brief Close all shm resources bound to the efa ep and domain. + * This function will do this cleanup as best effort. When there is failure + * to clean up shm resource, it will still move forward by setting the resource + * pointer to NULL so it won't be used later. + * + * @param efa_rdm_ep pointer to efa_rdm_ep. */ -static -void efa_rdm_ep_set_use_shm_for_tx(struct efa_rdm_ep *ep) +static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) { - if (!efa_rdm_ep_domain(ep)->shm_domain) { - ep->use_shm_for_tx = false; - return; + int ret; + struct efa_domain *efa_domain; + struct efa_av *efa_av; + struct efa_rdm_cq *efa_rdm_cq; + + if (efa_rdm_ep->shm_ep) { + ret = fi_close(&efa_rdm_ep->shm_ep->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n"); + efa_rdm_ep->shm_ep = NULL; } - assert(ep->user_info); + efa_av = efa_rdm_ep->base_ep.av; + if (efa_av->shm_rdm_av) { + ret = fi_close(&efa_av->shm_rdm_av->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm av\n"); + efa_av->shm_rdm_av = NULL; + } - /* App provided hints supercede environmental variables. - * - * Using the shm provider comes with some overheads, so avoid - * initializing the provider if the app provides a hint that it does not - * require node-local communication. We can still loopback over the EFA - * device in cases where the app violates the hint and continues - * communicating with node-local peers. - * - * aws-ofi-nccl relies on this feature. + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq); + if (efa_rdm_cq->shm_cq) { + ret = fi_close(&efa_rdm_cq->shm_cq->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm cq\n"); + efa_rdm_cq->shm_cq = NULL; + } + + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq); + if (efa_rdm_cq->shm_cq) { + ret = fi_close(&efa_rdm_cq->shm_cq->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm cq\n"); + efa_rdm_cq->shm_cq = NULL; + } + + efa_domain = efa_rdm_ep_domain(efa_rdm_ep); + + if (efa_domain->shm_domain) { + ret = fi_close(&efa_domain->shm_domain->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm domain\n"); + efa_domain->shm_domain = NULL; + } + + if (efa_domain->fabric->shm_fabric) { + ret = fi_close(&efa_domain->fabric->shm_fabric->fid); + if (ret) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm fabric\n"); + efa_domain->fabric->shm_fabric = NULL; + } + + if (efa_domain->shm_info) { + fi_freeinfo(efa_domain->shm_info); + efa_domain->shm_info = NULL; + } +} + +/** + * @brief update the shm resources based on the + * the current ep status. When cuda_api_permitted + * is set as false via fi_setopt, shm should be + * shut down. This function must be called inside + * fi_enable which is called after fi_setopt. + * + * @param[in,out] ep efa_rdm_ep + */ +static +void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) +{ + bool use_shm; + + /* + * when efa_env.enable_shm_transfer is false + * , shm resources won't be created. */ - if ((ep->user_info->caps & FI_REMOTE_COMM) - /* but not local communication */ - && !(ep->user_info->caps & FI_LOCAL_COMM)) { - ep->use_shm_for_tx = false; + if (!efa_rdm_ep_domain(ep)->shm_domain) return; - } + + use_shm = true; + + assert(ep->user_info); /* * shm provider must make cuda calls to transfer cuda memory. @@ -935,12 +991,11 @@ void efa_rdm_ep_set_use_shm_for_tx(struct efa_rdm_ep *ep) if ((ep->user_info->caps & FI_HMEM) && hmem_ops[FI_HMEM_CUDA].initialized && !ep->cuda_api_permitted) { - ep->use_shm_for_tx = false; - return; + use_shm = false; } - ep->use_shm_for_tx = efa_env.enable_shm_transfer; - return; + if (!use_shm) + efa_rdm_ep_close_shm_resources(ep); } @@ -987,7 +1042,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) EFA_WARN(FI_LOG_EP_CTRL, "libfabric %s efa endpoint created! address: %s\n", fi_tostr("1", FI_TYPE_VERSION), ep_addr_str); - efa_rdm_ep_set_use_shm_for_tx(ep); + efa_rdm_ep_update_shm(ep); /* Enable shm provider endpoint & post recv buff. * Once core ep enabled, 18 bytes efa_addr (16 bytes raw + 2 bytes qpn) is set. diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index e89c0ffacde..292becbbda2 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -253,7 +253,7 @@ ssize_t efa_rdm_msg_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { shm_msg = (struct fi_msg *)msg; if (msg->desc) { efa_desc = msg->desc; @@ -292,7 +292,7 @@ ssize_t efa_rdm_msg_sendv(struct fid_ep *ep, const struct iovec *iov, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(count, desc, shm_desc); return fi_sendv(efa_rdm_ep->shm_ep, iov, shm_desc, count, peer->shm_fiaddr, context); @@ -320,7 +320,7 @@ ssize_t efa_rdm_msg_send(struct fid_ep *ep, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_send(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, peer->shm_fiaddr, context); @@ -351,7 +351,7 @@ ssize_t efa_rdm_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_senddata(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, data, peer->shm_fiaddr, context); @@ -382,7 +382,7 @@ ssize_t efa_rdm_msg_inject(struct fid_ep *ep, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { return fi_inject(efa_rdm_ep->shm_ep, buf, len, peer->shm_fiaddr); } @@ -413,7 +413,7 @@ ssize_t efa_rdm_msg_injectdata(struct fid_ep *ep, const void *buf, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { return fi_injectdata(efa_rdm_ep->shm_ep, buf, len, data, peer->shm_fiaddr); } @@ -451,7 +451,7 @@ ssize_t efa_rdm_msg_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged * peer = efa_rdm_ep_get_peer(efa_rdm_ep, tmsg->addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { shm_tmsg = (struct fi_msg_tagged *)tmsg; if (tmsg->desc) { efa_desc = tmsg->desc; @@ -491,7 +491,7 @@ ssize_t efa_rdm_msg_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, return ret; assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(count, desc, shm_desc); return fi_tsendv(efa_rdm_ep->shm_ep, iov, desc? shm_desc : NULL, count, peer->shm_fiaddr, tag, context); @@ -526,7 +526,7 @@ ssize_t efa_rdm_msg_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_tsend(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, peer->shm_fiaddr, tag, context); @@ -558,7 +558,7 @@ ssize_t efa_rdm_msg_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_tsenddata(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, data, peer->shm_fiaddr, tag, context); @@ -589,7 +589,7 @@ ssize_t efa_rdm_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { return fi_tinject(efa_rdm_ep->shm_ep, buf, len, peer->shm_fiaddr, tag); } @@ -619,7 +619,7 @@ ssize_t efa_rdm_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t l peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { return fi_tinjectdata(efa_rdm_ep->shm_ep, buf, len, data, peer->shm_fiaddr, tag); } diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index bae945ba150..cbe8e7f0b2c 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -147,7 +147,7 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin goto out; } - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { tmp_addr = msg->addr; tmp_desc = msg->desc; msg_clone = (struct fi_msg_rma *)msg; @@ -246,7 +246,7 @@ ssize_t efa_rdm_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **des peer = efa_rdm_ep_get_peer(efa_rdm_ep, src_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(iov_count, desc, shm_desc); @@ -290,7 +290,7 @@ ssize_t efa_rdm_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, peer = efa_rdm_ep_get_peer(efa_rdm_ep, src_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); @@ -468,7 +468,7 @@ ssize_t efa_rdm_rma_writemsg(struct fid_ep *ep, goto out; } - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { tmp_addr = msg->addr; tmp_desc = msg->desc; msg_clone = (struct fi_msg_rma *)msg; @@ -526,7 +526,7 @@ ssize_t efa_rdm_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **de peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(iov_count, desc, shm_desc); return fi_writev(efa_rdm_ep->shm_ep, iov, desc? shm_desc : NULL, iov_count, peer->shm_fiaddr, addr, key, context); @@ -569,7 +569,7 @@ ssize_t efa_rdm_rma_write(struct fid_ep *ep, const void *buf, size_t len, void * peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_write(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, peer->shm_fiaddr, addr, key, context); @@ -599,7 +599,7 @@ ssize_t efa_rdm_rma_writedata(struct fid_ep *ep, const void *buf, size_t len, peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); - if (peer->is_local && efa_rdm_ep->use_shm_for_tx) { + if (peer->is_local && efa_rdm_ep->shm_ep) { if (desc) efa_rdm_get_desc_for_shm(1, &desc, shm_desc); return fi_writedata(efa_rdm_ep->shm_ep, buf, len, desc? shm_desc[0] : NULL, data, peer->shm_fiaddr, addr, key, context); diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 7eadb008aee..7b03fe75256 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -105,10 +105,9 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, efa_rdm_ep->host_id = local_host_id; ibv_qpx = efa_rdm_ep->base_ep.qp->ibv_qp_ex; ibv_cqx = efa_rdm_ep->ibv_cq_ex; - /* set use_shm_for_tx to false to force efa_rdm_ep to use efa device to send, - * which means use EFA device to send. - */ - efa_rdm_ep->use_shm_for_tx = false; + /* close shm_ep to force efa_rdm_ep to use efa device to send */ + fi_close(&efa_rdm_ep->shm_ep->fid); + efa_rdm_ep->shm_ep = NULL; ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 988c49b28aa..398cecc0044 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -104,6 +104,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_pke *pkt_entry; uint64_t actual_peer_host_id = UINT64_MAX; + int ret; g_efa_unit_test_mocks.local_host_id = local_host_id; g_efa_unit_test_mocks.peer_host_id = peer_host_id; @@ -114,7 +115,10 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->host_id = g_efa_unit_test_mocks.local_host_id; - efa_rdm_ep->use_shm_for_tx = false; + /* close shm_ep to force efa_rdm_ep to use efa device to send */ + ret = fi_close(&efa_rdm_ep->shm_ep->fid); + assert_int_equal(ret, 0); + efa_rdm_ep->shm_ep = NULL; /* Create and register a fake peer */ assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); @@ -368,7 +372,10 @@ void test_efa_rdm_ep_dc_atomic_error_handling(struct efa_resource **state) msg.op = FI_SUM; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_ep->use_shm_for_tx = false; + /* close shm_ep to force efa_rdm_ep to use efa device to send */ + err = fi_close(&efa_rdm_ep->shm_ep->fid); + assert_int_equal(err, 0); + efa_rdm_ep->shm_ep = NULL; /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC diff --git a/prov/efa/test/efa_unit_test_rnr.c b/prov/efa/test/efa_unit_test_rnr.c index 5060a7c07ee..abf504f870d 100644 --- a/prov/efa/test/efa_unit_test_rnr.c +++ b/prov/efa/test/efa_unit_test_rnr.c @@ -37,7 +37,10 @@ void test_efa_rnr_queue_and_resend(struct efa_resource **state) efa_rdm_ep->base_ep.qp->ibv_qp_ex->wr_complete = &efa_mock_ibv_wr_complete_no_op; assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - efa_rdm_ep->use_shm_for_tx = false; + /* close shm_ep to force efa_rdm_ep to use efa device to send */ + ret = fi_close(&efa_rdm_ep->shm_ep->fid); + assert_int_equal(ret, 0); + efa_rdm_ep->shm_ep = NULL; ret = fi_send(resource->ep, send_buff.buff, send_buff.size, fi_mr_desc(send_buff.mr), peer_addr, NULL /* context */); assert_int_equal(ret, 0); assert_false(dlist_empty(&efa_rdm_ep->txe_list));