diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index e133c79b553..d2ec4dd2e70 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1451,6 +1451,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) { int err; int iov_idx = 0, rma_iov_idx = 0; + ssize_t copied; size_t iov_offset = 0, rma_iov_offset = 0; size_t write_once_len, max_write_once_len; struct efa_rdm_ep *ep; @@ -1458,7 +1459,6 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) assert(ope->iov_count > 0); assert(ope->rma_iov_count > 0); - efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE); ep = ope->ep; if (ope->bytes_write_total_len == 0) { /* According to libfabric document @@ -1473,8 +1473,12 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (OFI_UNLIKELY(!pkt_entry)) return -FI_EAGAIN; + /* Provide the registered bounce buffer and its desc to rdma-core. + * The user provided buffer/desc will not be used for 0 byte writes. + * This allows the user to pass NULL for buff/desc. + */ efa_rdm_pke_init_write_context( - pkt_entry, ope, ope->iov[0].iov_base, 0, ope->desc[0], + pkt_entry, ope, pkt_entry->wiredata, 0, fi_mr_desc(pkt_entry->mr), ope->rma_iov[0].addr, ope->rma_iov[0].key); err = efa_rdm_pke_write(pkt_entry); if (err) @@ -1482,6 +1486,9 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) return err; } + if (!(ope->fi_flags & FI_INJECT)) + efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE); + assert(ope->bytes_write_submitted < ope->bytes_write_total_len); max_write_once_len = MIN(efa_env.efa_write_segment_size, efa_rdm_ep_domain(ep)->device->max_rdma_size); @@ -1513,7 +1520,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops) return -FI_EAGAIN; - if (!ope->desc[iov_idx]) { + if (!ope->desc[iov_idx] && !(ope->fi_flags & FI_INJECT)) { /* efa_rdm_ope_try_fill_desc() did not fill the desc, * which means memory registration failed. * return -FI_EAGAIN here will cause user to run progress @@ -1527,6 +1534,16 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (OFI_UNLIKELY(!pkt_entry)) return -FI_EAGAIN; + if (ope->fi_flags & FI_INJECT) { + assert(ope->iov_count == 1); + assert(ope->total_len <= ep->inject_size); + copied = ofi_copy_from_hmem_iov(pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt), + ope->total_len, FI_HMEM_SYSTEM, 0, ope->iov, ope->iov_count, 0); + assert(copied == ope->total_len); + ope->desc[0] = fi_mr_desc(pkt_entry->mr); + ope->iov[0].iov_base = pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt); + } + write_once_len = MIN(ope->iov[iov_idx].iov_len - iov_offset, ope->rma_iov[rma_iov_idx].len - rma_iov_offset); write_once_len = MIN(write_once_len, max_write_once_len); @@ -1589,10 +1606,10 @@ int efa_rdm_ope_post_remote_read_or_queue(struct efa_rdm_ope *ope) /** * @brief post a local read request, queue it if necessary - * + * * a local read request is posted to copy data from a packet * entry to user posted receive buffer on device. - * + * * @param[in] rxe which has the receive buffer information * @param[in] rx_data_offset offset of data in the receive buffer * @param[in] pkt_entry which has the data diff --git a/prov/efa/src/rdm/efa_rdm_pkt_type.c b/prov/efa/src/rdm/efa_rdm_pkt_type.c index b06483c477b..ce08e18b2aa 100644 --- a/prov/efa/src/rdm/efa_rdm_pkt_type.c +++ b/prov/efa/src/rdm/efa_rdm_pkt_type.c @@ -4,6 +4,7 @@ #include "efa_rdm_peer.h" #include "efa_rdm_protocol.h" #include "efa_rdm_pkt_type.h" +#include "efa_rdm_pke_nonreq.h" struct efa_rdm_pkt_type_req_info EFA_RDM_PKT_TYPE_REQ_INFO_VEC[] = { /* rtm header */ @@ -137,5 +138,8 @@ size_t efa_rdm_pkt_type_get_max_hdr_size(void) pkt_type += 1; } + /* Non-emulated (real) rdma inject write requires a header */ + max_hdr_size = MAX(max_hdr_size, sizeof(struct efa_rdm_rma_context_pkt)); + return max_hdr_size; } \ No newline at end of file diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index cbe8e7f0b2c..c1b46f4cf64 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -317,14 +317,6 @@ ssize_t efa_rdm_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, static inline bool efa_rdm_rma_should_write_using_rdma(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe, struct efa_rdm_peer *peer) { - /* - * RDMA_WRITE does not support FI_INJECT, because device may - * need to re-send data and FI_INJECT allows user to re-use - * these buffers immediately. - */ - if (txe->fi_flags & FI_INJECT) - return false; - /* * Because EFA is unordered and EFA iov descriptions can be more * expressive than the IBV sge's, we only implement