-
Notifications
You must be signed in to change notification settings - Fork 389
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
prov/efa: Enable inject rdma write #9512
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -1451,14 +1451,14 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) | |||||||||||||||||||||||||||||||||||
{ | ||||||||||||||||||||||||||||||||||||
int err; | ||||||||||||||||||||||||||||||||||||
int iov_idx = 0, rma_iov_idx = 0; | ||||||||||||||||||||||||||||||||||||
ssize_t copied; | ||||||||||||||||||||||||||||||||||||
size_t iov_offset = 0, rma_iov_offset = 0; | ||||||||||||||||||||||||||||||||||||
size_t write_once_len, max_write_once_len; | ||||||||||||||||||||||||||||||||||||
struct efa_rdm_ep *ep; | ||||||||||||||||||||||||||||||||||||
struct efa_rdm_pke *pkt_entry; | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
assert(ope->iov_count > 0); | ||||||||||||||||||||||||||||||||||||
assert(ope->rma_iov_count > 0); | ||||||||||||||||||||||||||||||||||||
efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE); | ||||||||||||||||||||||||||||||||||||
ep = ope->ep; | ||||||||||||||||||||||||||||||||||||
if (ope->bytes_write_total_len == 0) { | ||||||||||||||||||||||||||||||||||||
/* According to libfabric document | ||||||||||||||||||||||||||||||||||||
|
@@ -1473,15 +1473,22 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) | |||||||||||||||||||||||||||||||||||
if (OFI_UNLIKELY(!pkt_entry)) | ||||||||||||||||||||||||||||||||||||
return -FI_EAGAIN; | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
/* Provide the registered bounce buffer and its desc to rdma-core. | ||||||||||||||||||||||||||||||||||||
* The user provided buffer/desc will not be used for 0 byte writes. | ||||||||||||||||||||||||||||||||||||
* This allows the user to pass NULL for buff/desc. | ||||||||||||||||||||||||||||||||||||
*/ | ||||||||||||||||||||||||||||||||||||
efa_rdm_pke_init_write_context( | ||||||||||||||||||||||||||||||||||||
pkt_entry, ope, ope->iov[0].iov_base, 0, ope->desc[0], | ||||||||||||||||||||||||||||||||||||
pkt_entry, ope, pkt_entry->wiredata, 0, fi_mr_desc(pkt_entry->mr), | ||||||||||||||||||||||||||||||||||||
ope->rma_iov[0].addr, ope->rma_iov[0].key); | ||||||||||||||||||||||||||||||||||||
err = efa_rdm_pke_write(pkt_entry); | ||||||||||||||||||||||||||||||||||||
if (err) | ||||||||||||||||||||||||||||||||||||
efa_rdm_pke_release_tx(pkt_entry); | ||||||||||||||||||||||||||||||||||||
return err; | ||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
if (!(ope->fi_flags & FI_INJECT)) | ||||||||||||||||||||||||||||||||||||
efa_rdm_ope_try_fill_desc(ope, 0, FI_WRITE); | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
assert(ope->bytes_write_submitted < ope->bytes_write_total_len); | ||||||||||||||||||||||||||||||||||||
max_write_once_len = MIN(efa_env.efa_write_segment_size, efa_rdm_ep_domain(ep)->device->max_rdma_size); | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
|
@@ -1513,7 +1520,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) | |||||||||||||||||||||||||||||||||||
if (ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops) | ||||||||||||||||||||||||||||||||||||
return -FI_EAGAIN; | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
if (!ope->desc[iov_idx]) { | ||||||||||||||||||||||||||||||||||||
if (!ope->desc[iov_idx] && !(ope->fi_flags & FI_INJECT)) { | ||||||||||||||||||||||||||||||||||||
/* efa_rdm_ope_try_fill_desc() did not fill the desc, | ||||||||||||||||||||||||||||||||||||
* which means memory registration failed. | ||||||||||||||||||||||||||||||||||||
* return -FI_EAGAIN here will cause user to run progress | ||||||||||||||||||||||||||||||||||||
|
@@ -1527,6 +1534,16 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) | |||||||||||||||||||||||||||||||||||
if (OFI_UNLIKELY(!pkt_entry)) | ||||||||||||||||||||||||||||||||||||
return -FI_EAGAIN; | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
if (ope->fi_flags & FI_INJECT) { | ||||||||||||||||||||||||||||||||||||
assert(ope->iov_count == 1); | ||||||||||||||||||||||||||||||||||||
assert(ope->total_len <= ep->inject_size); | ||||||||||||||||||||||||||||||||||||
copied = ofi_copy_from_hmem_iov(pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt), | ||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. line is too long There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. EFA doesn't enforce line length. |
||||||||||||||||||||||||||||||||||||
ope->total_len, FI_HMEM_SYSTEM, 0, ope->iov, ope->iov_count, 0); | ||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I was wrong here: it should be FI_HMEM_SYSTEM, as there is no MR passed in for inject. |
||||||||||||||||||||||||||||||||||||
assert(copied == ope->total_len); | ||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't be an assert. It should be warning and return error when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. libfabric/prov/efa/src/rdm/efa_rdm_pke_utils.c Lines 123 to 139 in 387a733
|
||||||||||||||||||||||||||||||||||||
ope->desc[0] = fi_mr_desc(pkt_entry->mr); | ||||||||||||||||||||||||||||||||||||
ope->iov[0].iov_base = pkt_entry->wiredata + sizeof(struct efa_rdm_rma_context_pkt); | ||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
write_once_len = MIN(ope->iov[iov_idx].iov_len - iov_offset, | ||||||||||||||||||||||||||||||||||||
ope->rma_iov[rma_iov_idx].len - rma_iov_offset); | ||||||||||||||||||||||||||||||||||||
write_once_len = MIN(write_once_len, max_write_once_len); | ||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You copy the data from user buffer to the packet entry but the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I hack this by:
and that works b/c:
passes in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The txe has a separate libfabric/prov/efa/src/rdm/efa_rdm_ope.c Lines 85 to 86 in d43c492
|
||||||||||||||||||||||||||||||||||||
|
@@ -1589,10 +1606,10 @@ int efa_rdm_ope_post_remote_read_or_queue(struct efa_rdm_ope *ope) | |||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||
/** | ||||||||||||||||||||||||||||||||||||
* @brief post a local read request, queue it if necessary | ||||||||||||||||||||||||||||||||||||
* | ||||||||||||||||||||||||||||||||||||
* | ||||||||||||||||||||||||||||||||||||
* a local read request is posted to copy data from a packet | ||||||||||||||||||||||||||||||||||||
* entry to user posted receive buffer on device. | ||||||||||||||||||||||||||||||||||||
* | ||||||||||||||||||||||||||||||||||||
* | ||||||||||||||||||||||||||||||||||||
* @param[in] rxe which has the receive buffer information | ||||||||||||||||||||||||||||||||||||
* @param[in] rx_data_offset offset of data in the receive buffer | ||||||||||||||||||||||||||||||||||||
* @param[in] pkt_entry which has the data | ||||||||||||||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reasoning for avoiding an MR reg call for zero byte write?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
efa_rdm_ope_try_fill_desc()
calls fi_mr_regv()
on the buffer. This line assumes that the user will pass a real registered buffer to do a 0 byte write. If you are doing a 0 byte write, then your buffer could be NULL, and your desc could be NULL b/c EFA shouldn't actually do anything with them. The fi_mr_regv()
will segfault on NULL desc. Also segfaults for FI_INJECT which always gives a NULL desc. The reason this is in its own commit is that you could hit this bug and segfault via fi_write() with 0 bytes and an invalid buffer in main. I wanted to be explicit that fixing this bug is separate from adding fi_inject_write().
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did you observe any performance impact for this commit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like a speedup usec/xfer 1.30 (before), .98 (after) for