Skip to content

Commit

Permalink
PML/UCX: improved error processing in MPI_Recv
Browse files Browse the repository at this point in the history
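- improved error processing in the UCX PML implementation of MPI_Recv:
  mca_pml_ucx_recv now returns the error code derived from the receive
  status instead of always returning OMPI_SUCCESS
- added error handling to mca_pml_ucx_mrecv: the result of
  ompi_request_wait is now returned instead of being discarded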

Signed-off-by: Sergey Oblomov <sergeyo@nvidia.com>
  • Loading branch information
Sergey Oblomov authored and Sergey Oblomov committed Nov 2, 2020
1 parent 487bbf3 commit eb9405d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
8 changes: 4 additions & 4 deletions ompi/mca/pml/ucx/pml_ucx.c
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
ucp_tag_t ucp_tag, ucp_tag_mask;
ucp_tag_recv_info_t info;
ucs_status_t status;
int result;

PML_UCX_TRACE_RECV("%s", buf, count, datatype, src, tag, comm, "recv");

Expand All @@ -627,15 +628,15 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
MCA_COMMON_UCX_PROGRESS_LOOP(ompi_pml_ucx.ucp_worker) {
status = ucp_request_test(req, &info);
if (status != UCS_INPROGRESS) {
mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);
result = mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);

#if SPC_ENABLE == 1
size_t dt_size;
ompi_datatype_type_size(datatype, &dt_size);
SPC_USER_OR_MPI(tag, dt_size*count,
OMPI_SPC_BYTES_RECEIVED_USER, OMPI_SPC_BYTES_RECEIVED_MPI);
#endif
return OMPI_SUCCESS;
return result;
}
}
}
Expand Down Expand Up @@ -1093,8 +1094,7 @@ int mca_pml_ucx_mrecv(void *buf, size_t count, ompi_datatype_t *datatype,

PML_UCX_MESSAGE_RELEASE(message);

ompi_request_wait(&req, status);
return OMPI_SUCCESS;
return ompi_request_wait(&req, status);
}

int mca_pml_ucx_start(size_t count, ompi_request_t** requests)
Expand Down
18 changes: 13 additions & 5 deletions ompi/mca/pml/ucx/pml_ucx_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
}
}

static inline void mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
ucs_status_t ucp_status,
const ucp_tag_recv_info_t *info)
{
Expand All @@ -186,15 +186,23 @@ static inline void mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
} else {
mpi_status->MPI_ERROR = MPI_ERR_INTERN;
}

return mpi_status->MPI_ERROR;
}

static inline void mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
ucs_status_t ucp_status,
const ucp_tag_recv_info_t *info)
static inline int mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
ucs_status_t ucp_status,
const ucp_tag_recv_info_t *info)
{
if (mpi_status != MPI_STATUS_IGNORE) {
mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
return mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
return UCS_OK;
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
return MPI_ERR_TRUNCATE;
}

return MPI_ERR_INTERN;
}

OBJ_CLASS_DECLARATION(mca_pml_ucx_persistent_request_t);
Expand Down

0 comments on commit eb9405d

Please sign in to comment.