Skip to content

Commit

Permalink
[v2.0.x] prov/efa: Skip rx pkt refill under certain threshold
Browse files Browse the repository at this point in the history
Libfabric currently refills the rx pkt pool on every cq read when there are >0 pkts to post,
which means it may post ibv_recv one at a time if there is only 1 pkt to post per cq read.
Such 1-by-1 posting is less performant than posting a batch at once.

This patch improves this strategy by introducing a threshold for the refilling.
When the number of internal rx pkts to post is lower than this threshold, the refill will be skipped.

Also introduced FI_EFA_INTERNAL_RX_REFILL_THRESHOLD that allows tuning this parameter.

Signed-off-by: Shi Jin <sjina@amazon.com>
(cherry picked from commit a149f51)
  • Loading branch information
shijin-aws authored and j-xiong committed Dec 11, 2024
1 parent 6deda39 commit 62dbef6
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 1 deletion.
5 changes: 5 additions & 0 deletions man/fi_efa.7.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ for details.
: Use device's unsolicited write recv functionality when it's available. (Default: 1).
Setting this environment variable to 0 can disable this feature.

*FI_EFA_INTERNAL_RX_REFILL_THRESHOLD*
: The minimum number of internal rx pkts to post before the EFA provider refills the internal rx pkt pool. (Default: 8).
When the number of internal rx pkts to post is lower than this threshold,
the refill will be skipped.

# SEE ALSO

[`fabric`(7)](fabric.7.html),
Expand Down
4 changes: 4 additions & 0 deletions prov/efa/src/efa_env.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct efa_env efa_env = {
.use_sm2 = false,
.huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC,
.use_unsolicited_write_recv = 1,
.internal_rx_refill_threshold = 8,
};

/**
Expand Down Expand Up @@ -132,6 +133,7 @@ void efa_env_param_get(void)
&efa_mr_max_cached_size);
fi_param_get_size_t(&efa_prov, "tx_size", &efa_env.tx_size);
fi_param_get_size_t(&efa_prov, "rx_size", &efa_env.rx_size);
fi_param_get_size_t(&efa_prov, "internal_rx_refill_threshold", &efa_env.internal_rx_refill_threshold);
fi_param_get_bool(&efa_prov, "rx_copy_unexp",
&efa_env.rx_copy_unexp);
fi_param_get_bool(&efa_prov, "rx_copy_ooo",
Expand Down Expand Up @@ -232,6 +234,8 @@ void efa_env_define()
"will use huge page unless FI_EFA_FORK_SAFE is set to 1/on/true.");
fi_param_define(&efa_prov, "use_unsolicited_write_recv", FI_PARAM_BOOL,
"Use device's unsolicited write recv functionality when it's available. (Default: true)");
fi_param_define(&efa_prov, "internal_rx_refill_threshold", FI_PARAM_SIZE_T,
"The threshold that EFA provider will refill the internal rx pkt pool. (Default: %zu)", efa_env.internal_rx_refill_threshold);
}


Expand Down
6 changes: 6 additions & 0 deletions prov/efa/src/efa_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ struct efa_env {
int use_sm2;
enum efa_env_huge_page_setting huge_page_setting;
int use_unsolicited_write_recv;
/**
* The threshold that EFA provider will refill the internal rx pkt pool.
* When the number of internal rx pkts to post is lower than this threshold,
* the refill will be skipped.
*/
size_t internal_rx_refill_threshold;
};

extern struct efa_env efa_env;
Expand Down
2 changes: 2 additions & 0 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep)

void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep);

int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep);

/**
* @brief return whether this endpoint should write error cq entry for RNR.
*
Expand Down
6 changes: 5 additions & 1 deletion prov/efa/src/rdm/efa_rdm_ep_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,11 @@ int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep)
{
int i, err;

if (ep->efa_rx_pkts_to_post == 0)
/**
* When efa_env.internal_rx_refill_threshold > efa_rdm_ep_get_rx_pool_size(ep),
* we should always refill when the pool is empty.
*/
if (ep->efa_rx_pkts_to_post < MIN(efa_env.internal_rx_refill_threshold, efa_rdm_ep_get_rx_pool_size(ep)))
return 0;

assert(ep->efa_rx_pkts_to_post + ep->efa_rx_pkts_posted <= ep->efa_max_outstanding_rx_ops);
Expand Down
80 changes: 80 additions & 0 deletions prov/efa/test/efa_unit_test_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -1219,3 +1219,83 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res

free(pkt_entry_vec);
}

/**
 * @brief Check that the internal rx pkt refill honors
 * efa_env.internal_rx_refill_threshold (FI_EFA_INTERNAL_RX_REFILL_THRESHOLD).
 *
 * The test posts the whole rx pool, releases a few pkts (fewer than the
 * effective threshold) and verifies that the bulk post is skipped. It then
 * releases more pkts until MIN(threshold, rx_size) is reached and verifies
 * that the bulk post refills all of them.
 *
 * @param[in,out] state     cmocka state holding the struct efa_resource
 * @param[in]     threshold value to use for efa_env.internal_rx_refill_threshold
 * @param[in]     rx_size   rx size requested via hints->rx_attr->size
 */
static
void test_efa_rdm_ep_rx_refill_impl(struct efa_resource **state, int threshold, int rx_size)
{
	/* Number of pkts released in the first phase, where no refill is expected. */
	const int initial_release = 4;
	struct efa_resource *resource = *state;
	struct efa_rdm_ep *efa_rdm_ep;
	struct efa_rdm_pke *pkt_entry;
	int i;
	size_t threshold_orig;

	/*
	 * The first phase expects the refill to be skipped after releasing
	 * initial_release pkts, which only holds when
	 * initial_release < MIN(threshold, rx_size). Reject equal values too,
	 * otherwise the test fails later with a misleading assertion.
	 */
	if (threshold <= initial_release || rx_size <= initial_release) {
		fprintf(stderr, "Too small threshold or rx_size for this test\n");
		fail();
	}

	/* Save the global setting so it can be restored at the end of the test. */
	threshold_orig = efa_env.internal_rx_refill_threshold;

	efa_env.internal_rx_refill_threshold = threshold;

	resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM);
	assert_non_null(resource->hints);
	resource->hints->rx_attr->size = rx_size;
	efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14),
						    resource->hints, true, true);

	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
	assert_int_equal(efa_rdm_ep_get_rx_pool_size(efa_rdm_ep), rx_size);

	/* Grow the rx pool and post rx pkts */
	efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep);
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, efa_rdm_ep_get_rx_pool_size(efa_rdm_ep));

	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
	for (i = 0; i < initial_release; i++) {
		pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i);
		assert_non_null(pkt_entry);
		efa_rdm_pke_release_rx(pkt_entry);
	}
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, initial_release);

	efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep);

	/**
	 * efa_rx_pkts_to_post < min(FI_EFA_INTERNAL_RX_REFILL_THRESHOLD, rx pool size)
	 * pkts should NOT be refilled
	 */
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, initial_release);
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size);

	/* releasing more pkts to reach the threshold or rx_size */
	for (i = initial_release; i < MIN(rx_size, threshold); i++) {
		pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i);
		assert_non_null(pkt_entry);
		efa_rdm_pke_release_rx(pkt_entry);
	}

	/* After the loop, i == MIN(rx_size, threshold): the total number of released pkts. */
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, i);

	efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep);

	/**
	 * efa_rx_pkts_to_post == min(FI_EFA_INTERNAL_RX_REFILL_THRESHOLD, rx pool size)
	 * pkts should be refilled
	 */
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0);
	assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size + i);

	/* recover the original value */
	efa_env.internal_rx_refill_threshold = threshold_orig;
}

/**
 * @brief Run the rx refill test with a refill threshold (8) that is smaller
 * than the rx size (64).
 *
 * @param[in,out] state cmocka state holding the struct efa_resource
 */
void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(struct efa_resource **state)
{
	const int refill_threshold = 8;
	const int rx_pool_size = 64;

	test_efa_rdm_ep_rx_refill_impl(state, refill_threshold, rx_pool_size);
}

/**
 * @brief Run the rx refill test with a refill threshold (128) that is larger
 * than the rx size (64).
 *
 * @param[in,out] state cmocka state holding the struct efa_resource
 */
void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource **state)
{
	const int refill_threshold = 128;
	const int rx_pool_size = 64;

	test_efa_rdm_ep_rx_refill_impl(state, refill_threshold, rx_pool_size);
}
2 changes: 2 additions & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ int main(void)
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
2 changes: 2 additions & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ void test_efa_rdm_ep_close_discard_posted_recv();
void test_efa_rdm_ep_zcpy_recv_cancel();
void test_efa_rdm_ep_zcpy_recv_eagain();
void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion();
void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size();
void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size();
void test_dgram_cq_read_empty_cq();
void test_ibv_cq_ex_read_empty_cq();
void test_ibv_cq_ex_read_failed_poll();
Expand Down

0 comments on commit 62dbef6

Please sign in to comment.