Skip to content

Commit

Permalink
prov/efa: release srx lock before inserting shm av
Browse files Browse the repository at this point in the history
Signed-off-by: Shi Jin <sjina@amazon.com>
  • Loading branch information
shijin-aws committed Oct 19, 2023
1 parent 6c28231 commit 1f6ef98
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 28 deletions.
28 changes: 1 addition & 27 deletions prov/efa/src/efa_av.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,32 +42,6 @@
#include "efa_av.h"
#include "rdm/efa_rdm_pke_utils.h"

/*
* Local/remote peer detection by comparing peer GID with stored local GIDs
*/
static bool efa_is_local_peer(struct efa_av *av, const void *addr)
{
int i;
uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw;

#if ENABLE_DEBUG
char raw_gid_str[INET6_ADDRSTRLEN] = { 0 };

if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) {
EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno);
return 0;
}
EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str);
#endif
for (i = 0; i < g_device_cnt; ++i) {
if (!memcmp(raw_gid, g_device_list[i].ibv_gid.raw, EFA_GID_LEN)) {
EFA_INFO(FI_LOG_AV, "The peer is local.\n");
return 1;
}
}

return 0;
}

/**
* @brief find efa_conn struct using fi_addr
Expand Down Expand Up @@ -306,7 +280,7 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn)
efa_rdm_peer_construct(peer, efa_rdm_ep, conn);

/* If peer is local, insert the address into shm provider's av */
if (efa_is_local_peer(av, conn->ep_addr) && av->shm_rdm_av) {
if (efa_is_local_peer(conn->ep_addr) && av->shm_rdm_av) {
if (av->shm_used >= efa_env.shm_av_size) {
EFA_WARN(FI_LOG_AV,
"Max number of shm AV entry (%d) has been reached.\n",
Expand Down
27 changes: 27 additions & 0 deletions prov/efa/src/efa_av.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,31 @@ fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qp

fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn);

/*
* Local/remote peer detection by comparing peer GID with stored local GIDs
*/
static inline bool efa_is_local_peer(const void *addr)
{
int i;
uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw;

#if ENABLE_DEBUG
char raw_gid_str[INET6_ADDRSTRLEN] = { 0 };

if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) {
EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno);
return 0;
}
EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str);
#endif
for (i = 0; i < g_device_cnt; ++i) {
if (!memcmp(raw_gid, g_device_list[i].ibv_gid.raw, EFA_GID_LEN)) {
EFA_INFO(FI_LOG_AV, "The peer is local.\n");
return 1;
}
}

return 0;
}

#endif
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ void efa_rdm_ep_set_use_shm_for_tx(struct efa_rdm_ep *ep)
return;
}

ep->use_shm_for_tx = efa_env.enable_shm_transfer;
ep->use_shm_for_tx = false; //efa_env.enable_shm_transfer;
return;
}

Expand Down
17 changes: 17 additions & 0 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -717,8 +717,13 @@ fi_addr_t efa_rdm_pke_insert_addr(struct efa_rdm_pke *pkt_entry, void *raw_addr)
fi_addr_t rdm_addr;
struct efa_rdm_ep *ep;
struct efa_rdm_base_hdr *base_hdr;
bool is_local_peer;
struct ofi_genlock *lock;


ep = pkt_entry->ep;
lock = efa_rdm_ep_get_peer_srx_ctx(ep)->lock;
assert(ofi_genlock_held(lock));

base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry);
if (base_hdr->version < EFA_RDM_PROTOCOL_VERSION) {
Expand All @@ -739,8 +744,20 @@ fi_addr_t efa_rdm_pke_insert_addr(struct efa_rdm_pke *pkt_entry, void *raw_addr)

assert(base_hdr->type >= EFA_RDM_REQ_PKT_BEGIN);

/*
* For a local peer, efa_av_insert_one will also insert the
* address into shm's av. shm's fi_av_insert acquires the
* srx_ctx lock to process the unspec_unexp_msg_queue, and
* efa's progress engine already holds that lock here. The
* lock must therefore be released before the call and
* re-acquired afterwards.
*/
is_local_peer = efa_is_local_peer(raw_addr);
if (is_local_peer)
ofi_genlock_unlock(lock);
ret = efa_av_insert_one(ep->base_ep.av, (struct efa_ep_addr *)raw_addr,
&rdm_addr, 0, NULL);
if (is_local_peer)
ofi_genlock_lock(lock);
if (OFI_UNLIKELY(ret != 0)) {
efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_AV_INSERT);
return -1;
Expand Down

0 comments on commit 1f6ef98

Please sign in to comment.