Skip to content

Commit

Permalink
debug trn2
Browse files (browse the repository at this point in the history)
Signed-off-by: Shi Jin <sjina@amazon.com>
  • Loading branch information
shijin-aws committed Dec 18, 2024
1 parent e5fe96e commit 07476aa
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
11 changes: 9 additions & 2 deletions fabtests/common/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -3756,11 +3756,14 @@ int ft_fill_buf(void *buf, size_t size)
msg_buf = (char *) buf;
}

printf("data filled in send buffer: \n");
for (i = 0; i < size; i++) {
msg_buf[i] = integ_alphabet[msg_index];
printf("%c", msg_buf[i]);
if (++msg_index >= integ_alphabet_length)
msg_index = 0;
}
printf("\n");

if (opts.iface != FI_HMEM_SYSTEM) {
ret = ft_hmem_copy_to(opts.iface, opts.device, buf, msg_buf, size);
Expand Down Expand Up @@ -3961,6 +3964,7 @@ int ft_check_buf(void *buf, size_t size)
int msg_index = 0;
size_t i;
int ret = 0;
bool err = false;

if (opts.iface != FI_HMEM_SYSTEM) {
assert(dev_host_buf);
Expand All @@ -3973,14 +3977,17 @@ int ft_check_buf(void *buf, size_t size)
recv_data = (char *)buf;
}

printf("Received data: \n");
for (i = 0; i < size; i++) {
c = integ_alphabet[msg_index];
printf("%c", recv_data[i]);
if (++msg_index >= integ_alphabet_length)
msg_index = 0;
if (c != recv_data[i])
break;
err = true;
}
if (i != size) {
printf("\n");
if (err) {
printf("Data check error (%c!=%c) at byte %zu for "
"buffer size %zu\n", c, recv_data[i], i, size);
ret = -FI_EIO;
Expand Down
8 changes: 8 additions & 0 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,13 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
}

pkt_entry->pkt_size = ibv_wc_read_byte_len(ibv_cq_ex);
EFA_WARN(FI_LOG_CQ, "Get recv completion from rdma-core of size %zu\n", pkt_entry->pkt_size);
if (pkt_entry->pkt_size == 84) {
printf("Receive payload data: \n");
for (int i=0; i<64; i++)
printf("%c", *((char *)pkt_entry->wiredata + i + 20));
printf("\n");
}
if (ibv_wc_read_wc_flags(ibv_cq_ex) & IBV_WC_WITH_IMM) {
has_imm_data = true;
imm_data = ibv_wc_read_imm_data(ibv_cq_ex);
Expand Down Expand Up @@ -516,6 +523,7 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq)
efa_rdm_pke_handle_send_completion(pkt_entry);
break;
case IBV_WC_RECV:

efa_rdm_cq_handle_recv_completion(ibv_cq, pkt_entry, ep);
#if ENABLE_DEBUG
ep->recv_comps++;
Expand Down
6 changes: 6 additions & 0 deletions prov/efa/src/rdm/efa_rdm_pke_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke,
mr_p2p_available = false;
use_inline_buf = false;

EFA_WARN(FI_LOG_EP_DATA, "pkt hdr size %zu, payload size %zu\n", payload_offset, data_size);
if (iov_mr) {
ret = efa_rdm_ep_use_p2p(pke->ep, iov_mr);
if (ret < 0)
Expand Down Expand Up @@ -465,6 +466,11 @@ ssize_t efa_rdm_pke_copy_payload_to_ope(struct efa_rdm_pke *pke,
if (efa_mr_is_cuda(desc))
return efa_rdm_pke_copy_payload_to_cuda(pke, ope);

if (efa_mr_is_neuron(desc)) {
EFA_WARN(FI_LOG_EP_DATA, "use local rdma read to copy data from bounce buffer to neuron buffer, payload size %zu\n", pke->payload_size);
return efa_rdm_rxe_post_local_read_or_queue(ope, segment_offset, pke, pke->payload, pke->payload_size);
}

if (efa_mr_is_hmem(desc))
return efa_rdm_pke_queued_copy_payload_to_hmem(pke, ope);

Expand Down

0 comments on commit 07476aa

Please sign in to comment.