Skip to content

Commit

Permalink
prov/efa: Enable inject rdma write
Browse files Browse the repository at this point in the history
Turns off emulated inject rdma write, and enables real inject rdma
write. Inject rdma write is special for two reasons:

1. The user gets to re-use their buffer immediately after we return
2. RDMA operations require MRs, and the inject write interfaces do not
   provide a memory descriptor

In order to handle both cases, the user's data is copied into a
pre-registered bounce buffer. The bounce buffer is allocated with a size of
sizeof(struct efa_rdm_pke) + ep->mtu_size, where the wire data starts at
ep->mtu_size. The RDMA operations use part of the wire data for an RDMA
header, struct efa_rdm_rma_context_pkt. This change therefore needs to lower
the maximum inject size so that it is never more than mtu_size - sizeof(struct
efa_rdm_rma_context_pkt). I created a GitHub issue for the libfabric 2.0 API
to separate the inject sizes for send/recv, atomic, and RDMA operations
(issue 9510).

Signed-off-by: Seth Zegelstein <szegel@amazon.com>
  • Loading branch information
a-szegel committed Nov 3, 2023
1 parent 5d39682 commit a09f91e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 11 deletions.
17 changes: 15 additions & 2 deletions prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -1451,6 +1451,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope)
{
int err;
int iov_idx = 0, rma_iov_idx = 0;
ssize_t copied;
size_t iov_offset = 0, rma_iov_offset = 0;
size_t write_once_len, max_write_once_len;
struct efa_rdm_ep *ep;
Expand Down Expand Up @@ -1482,7 +1483,9 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope)
return err;
}

efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE);
if (!(ope->fi_flags & FI_INJECT))
efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE);

assert(ope->bytes_write_submitted < ope->bytes_write_total_len);
max_write_once_len = MIN(efa_env.efa_write_segment_size, efa_rdm_ep_domain(ep)->device->max_rdma_size);

Expand Down Expand Up @@ -1514,7 +1517,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope)
if (ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops)
return -FI_EAGAIN;

if (!ope->desc[iov_idx]) {
if (!ope->desc[iov_idx] && !(ope->fi_flags & FI_INJECT)) {
/* efa_rdm_ope_try_fill_desc() did not fill the desc,
* which means memory registration failed.
* return -FI_EAGAIN here will cause user to run progress
Expand All @@ -1528,6 +1531,16 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope)
if (OFI_UNLIKELY(!pkt_entry))
return -FI_EAGAIN;

if (ope->fi_flags & FI_INJECT) {
assert(ope->iov_count == 1);
assert(ope->total_len <= ep->inject_size);
copied = ofi_copy_from_hmem_iov(pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt),
ope->total_len, FI_HMEM_SYSTEM, 0, ope->iov, ope->iov_count, 0);
assert(copied == ope->total_len);
ope->desc[0] = fi_mr_desc(pkt_entry->mr);
ope->iov[0].iov_base = pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt);
}

write_once_len = MIN(ope->iov[iov_idx].iov_len - iov_offset,
ope->rma_iov[rma_iov_idx].len - rma_iov_offset);
write_once_len = MIN(write_once_len, max_write_once_len);
Expand Down
6 changes: 5 additions & 1 deletion prov/efa/src/rdm/efa_rdm_pkt_type.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "efa_rdm_peer.h"
#include "efa_rdm_protocol.h"
#include "efa_rdm_pkt_type.h"
#include "efa_rdm_pke_nonreq.h"

struct efa_rdm_pkt_type_req_info EFA_RDM_PKT_TYPE_REQ_INFO_VEC[] = {
/* rtm header */
Expand Down Expand Up @@ -137,5 +138,8 @@ size_t efa_rdm_pkt_type_get_max_hdr_size(void)
pkt_type += 1;
}

/* Non-emulated (real) rdma inject write requires a header */
max_hdr_size = MAX(max_hdr_size, sizeof(struct efa_rdm_rma_context_pkt));

return max_hdr_size;
}
}
8 changes: 0 additions & 8 deletions prov/efa/src/rdm/efa_rdm_rma.c
Original file line number Diff line number Diff line change
Expand Up @@ -317,14 +317,6 @@ ssize_t efa_rdm_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc,
static inline
bool efa_rdm_rma_should_write_using_rdma(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe, struct efa_rdm_peer *peer)
{
/*
* RDMA_WRITE does not support FI_INJECT, because device may
* need to re-send data and FI_INJECT allows user to re-use
* these buffers immediately.
*/
if (txe->fi_flags & FI_INJECT)
return false;

/*
* Because EFA is unordered and EFA iov descriptions can be more
* expressive than the IBV sge's, we only implement
Expand Down

0 comments on commit a09f91e

Please sign in to comment.