From d55303134d26611f19d1cacfa5ba719869b92e98 Mon Sep 17 00:00:00 2001
From: Jianxin Xiong
Date: Wed, 17 May 2023 10:01:36 -0700
Subject: [PATCH] prov/rxm: Add option to auto detect hmem iface of user buffers

A new option, FI_OFI_RXM_DETECT_HMEM_IFACE, is added. With the option
turned on, RxM tries to detect the HMEM iface of user buffers when no
mr_desc is supplied. This allows successful copies between user buffers
located in device memory and the internal bounce buffers, as well as
registration of such user buffers in the RxM Rendezvous protocol. Note
that this doesn't allow such buffers to be used in the RxM direct send
mode if the core provider requires FI_MR_HMEM, because the option
introduces no extra memory registration.

Signed-off-by: Jianxin Xiong
---
 man/fi_rxm.7.md           |  6 ++++++
 prov/rxm/src/rxm.h        | 18 +++++++++++++++---
 prov/rxm/src/rxm_atomic.c | 14 ++++++++------
 prov/rxm/src/rxm_cq.c     | 22 +++++++++++++---------
 prov/rxm/src/rxm_domain.c | 18 +++++++++++++++---
 prov/rxm/src/rxm_init.c   |  8 ++++++++
 prov/rxm/src/rxm_msg.c    |  4 ++--
 7 files changed, 67 insertions(+), 23 deletions(-)
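Notes (not part of the commit): a minimal sketch of the intended use
case, illustrative only. The endpoint setup, peer address, and the
device allocation (e.g. cudaMalloc) are assumed, not part of this patch:

#include <stdlib.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Assumed to come from the usual libfabric setup, omitted here. */
extern struct fid_ep *ep;
extern fi_addr_t dest_addr;

/* Must run before fi_getinfo()/provider init so RxM reads the value. */
static void enable_detection(void)
{
	setenv("FI_OFI_RXM_DETECT_HMEM_IFACE", "1", 1);
}

static ssize_t send_from_device(void *dev_buf, size_t len, void *ctx)
{
	/* desc is NULL: with detection on, RxM resolves the HMEM iface
	 * of dev_buf itself, so the copy into its bounce buffer (or the
	 * Rendezvous-path registration) uses the right HMEM routines
	 * instead of treating the buffer as host memory. */
	return fi_send(ep, dev_buf, len, NULL, dest_addr, ctx);
}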
diff --git a/man/fi_rxm.7.md b/man/fi_rxm.7.md
index f5ae80716c5..cc163f08cbb 100644
--- a/man/fi_rxm.7.md
+++ b/man/fi_rxm.7.md
@@ -175,6 +175,12 @@ with (default: 256).
   consecutively read across progress calls without checking to see if the
   CM progress interval has been reached (default: 128)
 
+*FI_OFI_RXM_DETECT_HMEM_IFACE*
+: Set this to 1 to enable automatic detection of the HMEM iface of user
+  buffers when no mr_desc is supplied. This allows such buffers to be
+  copied or registered (e.g. in Rendezvous) internally by RxM. Note that
+  no extra memory registration is performed with this option. (default: false)
+
 # Tuning
 
 ## Bandwidth
diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h
index 1164e72cbf0..bf53299ce1f 100644
--- a/prov/rxm/src/rxm.h
+++ b/prov/rxm/src/rxm.h
@@ -205,6 +205,7 @@ extern size_t rxm_cq_eq_fairness;
 extern int rxm_passthru;
 extern int force_auto_progress;
 extern int rxm_use_write_rndv;
+extern int rxm_detect_hmem_iface;
 extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj;
 
 struct rxm_ep;
@@ -309,11 +310,22 @@ struct rxm_mr {
 };
 
 static inline enum fi_hmem_iface
-rxm_mr_desc_to_hmem_iface_dev(void **desc, size_t count, uint64_t *device)
+rxm_iov_desc_to_hmem_iface_dev(const struct iovec *iov, void **desc,
+			       size_t count, uint64_t *device)
 {
-	if (!count || !desc || !desc[0]) {
+	enum fi_hmem_iface iface = FI_HMEM_SYSTEM;
+
+	if (!count) {
 		*device = 0;
-		return FI_HMEM_SYSTEM;
+		return iface;
+	}
+
+	if (!desc || !desc[0]) {
+		if (rxm_detect_hmem_iface)
+			iface = ofi_get_hmem_iface(iov[0].iov_base, device, NULL);
+		else
+			*device = 0;
+		return iface;
 	}
 
 	*device = ((struct rxm_mr *) desc[0])->device;
diff --git a/prov/rxm/src/rxm_atomic.c b/prov/rxm/src/rxm_atomic.c
index 9fe0e1f8b0d..2c9711910d8 100644
--- a/prov/rxm/src/rxm_atomic.c
+++ b/prov/rxm/src/rxm_atomic.c
@@ -124,9 +124,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			     datatype_sz);
 		buf_len = ofi_total_iov_len(buf_iov, msg->iov_count);
 
-		buf_iface = rxm_mr_desc_to_hmem_iface_dev(msg->desc,
-							  msg->iov_count,
-							  &buf_device);
+		buf_iface = rxm_iov_desc_to_hmem_iface_dev(buf_iov,
+							   msg->desc,
+							   msg->iov_count,
+							   &buf_device);
 	}
 
 	if (op == ofi_op_atomic_compare) {
@@ -136,9 +137,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count);
 		assert(buf_len == cmp_len);
 
-		cmp_iface = rxm_mr_desc_to_hmem_iface_dev(compare_desc,
-							  compare_iov_count,
-							  &cmp_device);
+		cmp_iface = rxm_iov_desc_to_hmem_iface_dev(cmp_iov,
+							   compare_desc,
+							   compare_iov_count,
+							   &cmp_device);
 	}
 
 	data_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr);
diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c
index 2b4e169ba6c..8ba24f9a544 100644
--- a/prov/rxm/src/rxm_cq.c
+++ b/prov/rxm/src/rxm_cq.c
@@ -378,9 +378,10 @@ static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 	uint64_t device;
 	ssize_t done_len;
 
-	iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
-					      rx_buf->recv_entry->rxm_iov.count,
-					      &device);
+	iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov,
+					       rx_buf->recv_entry->rxm_iov.desc,
+					       rx_buf->recv_entry->rxm_iov.count,
+					       &device);
 
 	done_len = ofi_copy_to_hmem_iov(iface, device,
 					rx_buf->recv_entry->rxm_iov.iov,
@@ -629,6 +630,7 @@ void rxm_handle_eager(struct rxm_rx_buf *rx_buf)
 			  rx_buf->recv_entry->rxm_iov.desc, rx_buf->data,
 			  rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov,
 			  rx_buf->recv_entry->rxm_iov.count, 0);
+	assert((size_t) done_len == rx_buf->pkt.hdr.size);
 
 	rxm_finish_recv(rx_buf, done_len);
 }
@@ -640,9 +642,10 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf)
 	uint64_t device;
 	ssize_t done_len;
 
-	iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
-					      rx_buf->recv_entry->rxm_iov.count,
-					      &device);
+	iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov,
+					       rx_buf->recv_entry->rxm_iov.desc,
+					       rx_buf->recv_entry->rxm_iov.count,
+					       &device);
 
 	done_len = ofi_copy_to_hmem_iov(iface, device,
 					rx_buf->recv_entry->rxm_iov.iov,
@@ -1247,9 +1250,10 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 	       " msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.hdr.op,
 	       rx_buf->pkt.ctrl_hdr.msg_id);
 
-	iface = rxm_mr_desc_to_hmem_iface_dev(tx_buf->atomic_result.desc,
-					      tx_buf->atomic_result.count,
-					      &device);
+	iface = rxm_iov_desc_to_hmem_iface_dev(tx_buf->atomic_result.iov,
+					       tx_buf->atomic_result.desc,
+					       tx_buf->atomic_result.count,
+					       &device);
 
 	assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA)));
 
diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c
index a5c0233a17b..055fca16bea 100644
--- a/prov/rxm/src/rxm_domain.c
+++ b/prov/rxm/src/rxm_domain.c
@@ -474,12 +474,24 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf,
 			    size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr)
 {
 	int ret, tries = 0;
+	struct iovec iov = {
+		.iov_base = (void *)buf,
+		.iov_len = len,
+	};
+	struct fi_mr_attr attr = {
+		.mr_iov = &iov,
+		.iov_count = 1,
+		.access = acs,
+		.iface = FI_HMEM_SYSTEM,
+	};
+
+	if (rxm_detect_hmem_iface)
+		attr.iface = ofi_get_hmem_iface(buf, &attr.device.reserved, NULL);
 
 	/* If we can't get a key within 1024 tries, give up */
 	do {
-		ret = fi_mr_reg(rxm_domain->msg_domain, buf, len, acs, 0,
-				rxm_domain->mr_key++ | (1UL << 31),
-				flags, mr, NULL);
+		attr.requested_key = rxm_domain->mr_key++ | (1UL << 31);
+		ret = fi_mr_regattr(rxm_domain->msg_domain, &attr, flags, mr);
 	} while (ret == -FI_ENOKEY && tries++ < 1024);
 
 	return ret;
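The fi_mr_reg() to fi_mr_regattr() conversion above is what carries the
detected iface/device down to the core provider. For comparison, a
minimal sketch of the same pattern from the application side, i.e. an
explicit registration that makes detection unnecessary; 'domain', the
buffer, and the CUDA device ID are assumed, not part of this patch:

#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

extern struct fid_domain *domain;	/* assumed open elsewhere */

static int reg_cuda_buf(void *cuda_buf, size_t len, int cuda_dev_id,
			struct fid_mr **mr)
{
	struct iovec iov = {
		.iov_base = cuda_buf,
		.iov_len = len,
	};
	/* Spell out iface/device instead of relying on detection; the
	 * resulting fi_mr_desc() can then be passed as mr_desc. */
	struct fi_mr_attr attr = {
		.mr_iov = &iov,
		.iov_count = 1,
		.access = FI_SEND | FI_RECV,
		.iface = FI_HMEM_CUDA,
		.device.cuda = cuda_dev_id,
	};

	return fi_mr_regattr(domain, &attr, 0, mr);
}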
diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c
index 3ca3c22593f..a29c530b0d3 100644
--- a/prov/rxm/src/rxm_init.c
+++ b/prov/rxm/src/rxm_init.c
@@ -58,6 +58,7 @@ size_t rxm_packet_size;
 int rxm_passthru = 0; /* disable by default, need to analyze performance */
 int force_auto_progress;
 int rxm_use_write_rndv;
+int rxm_detect_hmem_iface;
 enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC;
 
 char *rxm_proto_state_str[] = {
@@ -700,6 +701,11 @@ RXM_INI
 		       "to the tcp provider, depending on the capabilities "
 		       "requested by the application.");
 
+	fi_param_define(&rxm_prov, "detect_hmem_iface", FI_PARAM_BOOL,
+			"Detect the HMEM iface of user buffers when a NULL "
+			"desc is passed in. This allows such buffers to be "
+			"copied or registered internally by RxM. (default: false)");
+
 	/* passthru supported disabled - to re-enable would need to fix call to
 	 * fi_cq_read to pass in the correct data structure. However, passthru
 	 * will not be needed at all with in-work tcp changes.
@@ -725,6 +731,8 @@ RXM_INI
 			"(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading "
 			"level would be set to FI_THREAD_SAFE\n");
 
+	fi_param_get_bool(&rxm_prov, "detect_hmem_iface", &rxm_detect_hmem_iface);
+
 #if HAVE_RXM_DL
 	ofi_mem_init();
 	ofi_hmem_init();
diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c
index 39af82972bc..fdd036e7d32 100644
--- a/prov/rxm/src/rxm_msg.c
+++ b/prov/rxm/src/rxm_msg.c
@@ -473,7 +473,7 @@ rxm_send_sar(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	ssize_t ret;
 
 	assert(segs_cnt >= 2);
-	iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
+	iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device);
 
 	first_tx_buf = rxm_init_segment(rxm_ep, rxm_conn, context,
 					data_len, rxm_buffer_size,
@@ -709,7 +709,7 @@ rxm_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		    (data_len > rxm_ep->rxm_info->tx_attr->inject_size)) ||
 		   (data_len <= rxm_ep->rxm_info->tx_attr->inject_size));
 
-	iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
+	iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device);
 
 	if (iface == FI_HMEM_ZE || iface == FI_HMEM_SYNAPSEAI)
 		goto rndv_send;
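As the commit message notes, the option introduces no memory
registration, so device buffers used on the direct-send path still need
an explicit MR when FI_MR_HMEM is in effect. A sketch of the
corresponding application-side check; 'need_explicit_hmem_mr' is an
illustrative helper, not part of this patch, and 'info' is assumed to be
the fi_info returned by fi_getinfo():

#include <stdbool.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

static bool need_explicit_hmem_mr(const struct fi_info *info)
{
	/* If the reported mr_mode carries FI_MR_HMEM, register device
	 * buffers explicitly (e.g. via fi_mr_regattr() as sketched
	 * earlier) rather than relying on iface detection alone. */
	return (info->domain_attr->mr_mode & FI_MR_HMEM) != 0;
}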