Skip to content

Commit

Permalink
prov/efa: Fix a memory leak in local read
Browse files Browse the repository at this point in the history
txe should be released when efa_rdm_ope_post_remote_read_or_queue returns error.

Signed-off-by: Shi Jin <sjina@amazon.com>
  • Loading branch information
shijin-aws committed Aug 13, 2024
1 parent 556bb56 commit a78271c
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 3 deletions.
3 changes: 2 additions & 1 deletion prov/efa/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ prov_efa_test_efa_unit_test_LDFLAGS = $(cmocka_rpath) $(efa_LDFLAGS) $(cmocka_LD
-Wl,--wrap=ibv_is_fork_initialized \
-Wl,--wrap=efadv_query_device \
-Wl,--wrap=ofi_cudaMalloc \
-Wl,--wrap=ofi_copy_from_hmem_iov
-Wl,--wrap=ofi_copy_from_hmem_iov \
-Wl,--wrap=efa_rdm_pke_read

if HAVE_EFADV_CQ_EX
prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=efadv_create_cq
Expand Down
4 changes: 3 additions & 1 deletion prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -1656,7 +1656,9 @@ int efa_rdm_rxe_post_local_read_or_queue(struct efa_rdm_ope *rxe,
txe->local_read_pkt_entry = pkt_entry;
err = efa_rdm_ope_post_remote_read_or_queue(txe);
/* The rx pkts are held until the local read completes */
if (txe->local_read_pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL && !err)
if (err)
efa_rdm_txe_release(txe);
else if (txe->local_read_pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL)
txe->ep->efa_rx_pkts_held++;

return err;
Expand Down
11 changes: 11 additions & 0 deletions prov/efa/test/efa_unit_test_mocks.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,11 @@ ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size,
return __real_ofi_copy_from_hmem_iov(dest, size, hmem_iface, device, hmem_iov, hmem_iov_count, hmem_iov_offset);
}

int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope)
{
return mock();
}

struct efa_unit_test_mocks g_efa_unit_test_mocks = {
.local_host_id = 0,
.peer_host_id = 0,
Expand All @@ -207,6 +212,7 @@ struct efa_unit_test_mocks g_efa_unit_test_mocks = {
.ofi_cudaMalloc = __real_ofi_cudaMalloc,
#endif
.ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov,
.efa_rdm_pke_read = __real_efa_rdm_pke_read,
.ibv_is_fork_initialized = __real_ibv_is_fork_initialized,
#if HAVE_EFADV_QUERY_MR
.efadv_query_mr = __real_efadv_query_mr,
Expand Down Expand Up @@ -336,6 +342,11 @@ ssize_t __wrap_ofi_copy_from_hmem_iov(void *dest, size_t size,
return g_efa_unit_test_mocks.ofi_copy_from_hmem_iov(dest, size, hmem_iface, device, hmem_iov, hmem_iov_count, hmem_iov_offset);
}

int __wrap_efa_rdm_pke_read(struct efa_rdm_ope *ope)
{
return g_efa_unit_test_mocks.efa_rdm_pke_read(ope);
}

enum ibv_fork_status __wrap_ibv_is_fork_initialized(void)
{
return g_efa_unit_test_mocks.ibv_is_fork_initialized();
Expand Down
6 changes: 6 additions & 0 deletions prov/efa/test/efa_unit_test_mocks.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size,
const struct iovec *hmem_iov,
size_t hmem_iov_count, uint64_t hmem_iov_offset);

int __real_efa_rdm_pke_read(struct efa_rdm_ope *ope);

int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope);

struct efa_unit_test_mocks
{
uint64_t local_host_id;
Expand Down Expand Up @@ -112,6 +116,8 @@ struct efa_unit_test_mocks
const struct iovec *hmem_iov,
size_t hmem_iov_count, uint64_t hmem_iov_offset);

int (*efa_rdm_pke_read)(struct efa_rdm_ope *ope);

enum ibv_fork_status (*ibv_is_fork_initialized)(void);

#if HAVE_EFADV_QUERY_MR
Expand Down
60 changes: 59 additions & 1 deletion prov/efa/test/efa_unit_test_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -282,4 +282,62 @@ void test_efa_rdm_ope_post_write_0_byte(struct efa_resource **state)
efa_rdm_pke_release_tx((struct efa_rdm_pke *)g_ibv_submitted_wr_id_vec[0]);
mock_txe.ep->efa_outstanding_tx_ops = 0;
efa_unit_test_buff_destruct(&local_buff);
}
}

/**
* @brief efa_rdm_rxe_post_local_read_or_queue should call
* efa_rdm_pke_read.
* When efa_rdm_pke_read failed,
* make sure there is no txe leak in efa_rdm_rxe_post_local_read_or_queue
*
* @param[in] state struct efa_resource that is managed by the framework
*/
void test_efa_rdm_rxe_post_local_read_or_queue_cleanup_txe(struct efa_resource **state)
{
struct efa_rdm_ep *efa_rdm_ep;
struct efa_resource *resource = *state;
struct efa_rdm_pke *pkt_entry;
struct efa_rdm_ope *rxe;
struct efa_mr cuda_mr = {0};
int expected_err = -FI_ENOMR;
char buf[16];
struct iovec iov = {
.iov_base = buf,
.iov_len = sizeof buf
};

/**
* TODO: Ideally we should mock efa_rdm_ope_post_remote_read_or_queue here,
* but this function is currently cannot be mocked as it is at the same file
* with efa_rdm_rxe_post_local_read_or_queue, see this restriction in
* prov/efa/test/README.md's mocking session
*/
g_efa_unit_test_mocks.efa_rdm_pke_read = &efa_mock_efa_rdm_pke_read_return_mock;

efa_unit_test_resource_construct(resource, FI_EP_RDM);

efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);

/* Fake a rdma read enabled device */
efa_rdm_ep_domain(efa_rdm_ep)->device->max_rdma_size = efa_env.efa_read_segment_size;

pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL);
assert_non_null(pkt_entry);
pkt_entry->payload = pkt_entry->wiredata;

rxe = efa_rdm_ep_alloc_rxe(efa_rdm_ep, FI_ADDR_UNSPEC, ofi_op_tagged);
cuda_mr.peer.iface = FI_HMEM_CUDA;

rxe->desc[0] = &cuda_mr;
rxe->iov_count = 1;
rxe->iov[0] = iov;
pkt_entry->ope = rxe;

assert_true(dlist_empty(&efa_rdm_ep->txe_list));
will_return(efa_mock_efa_rdm_pke_read_return_mock, expected_err);
assert_int_equal(efa_rdm_rxe_post_local_read_or_queue(rxe, 0, pkt_entry, pkt_entry->payload, 16), expected_err);
/* Make sure txe is cleaned for a failed read */
assert_true(dlist_empty(&efa_rdm_ep->txe_list));

efa_rdm_pke_release_rx(pkt_entry);
}
2 changes: 2 additions & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ static int efa_unit_test_mocks_teardown(void **state)
.ofi_cudaMalloc = __real_ofi_cudaMalloc,
#endif
.ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov,
.efa_rdm_pke_read = __real_efa_rdm_pke_read,
.ibv_is_fork_initialized = __real_ibv_is_fork_initialized,
};

Expand Down Expand Up @@ -164,6 +165,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_efa_rdm_ope_prepare_to_post_send_cuda_memory_align128, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ope_post_write_0_byte,
efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_rxe_post_local_read_or_queue_cleanup_txe, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_msg_send_to_local_peer_with_null_desc, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_fork_support_request_initialize_when_ibv_fork_support_is_needed, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_fork_support_request_initialize_when_ibv_fork_support_is_unneeded, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory_align128();
void test_efa_rdm_ope_prepare_to_post_send_cuda_memory();
void test_efa_rdm_ope_prepare_to_post_send_cuda_memory_align128();
void test_efa_rdm_ope_post_write_0_byte();
void test_efa_rdm_rxe_post_local_read_or_queue_cleanup_txe();
void test_efa_rdm_msg_send_to_local_peer_with_null_desc();
void test_efa_fork_support_request_initialize_when_ibv_fork_support_is_needed();
void test_efa_fork_support_request_initialize_when_ibv_fork_support_is_unneeded();
Expand Down

0 comments on commit a78271c

Please sign in to comment.