From acc2e4e0b92e7461b45cea44b2b215b545817dae Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Sun, 21 Jan 2018 10:36:57 +0900 Subject: [PATCH 1/2] oshmem: add some useless but inoffensive communications and see how oshmem+ucx handles that Signed-off-by: Gilles Gouaillardet --- oshmem/runtime/oshmem_shmem_init.c | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/oshmem/runtime/oshmem_shmem_init.c b/oshmem/runtime/oshmem_shmem_init.c index ae58e837693..15ebe0af805 100644 --- a/oshmem/runtime/oshmem_shmem_init.c +++ b/oshmem/runtime/oshmem_shmem_init.c @@ -51,6 +51,8 @@ #include "opal/mca/allocator/base/base.h" #include "ompi/proc/proc.h" #include "ompi/runtime/mpiruntime.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/request/request.h" #include "oshmem/constants.h" #include "oshmem/runtime/runtime.h" @@ -152,6 +154,74 @@ int oshmem_shmem_init(int argc, char **argv, int requested, int *provided) return ret; } + { + int world_rank, world_size; + int *prank, *ranks; + ompi_request_t **reqs, *req; + ompi_status_public_t status, *statuses; + PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + PMPI_Comm_size(MPI_COMM_WORLD, &world_size); + prank = (int *)malloc(sizeof(int)); + *prank = world_rank; + if (0 == world_rank) { + ranks = (int *)calloc(world_size-1, sizeof(int)); + reqs = (ompi_request_t **)calloc(world_size-1, sizeof(ompi_request_t *)); + statuses = (ompi_status_public_t *)calloc(world_size-1, sizeof(ompi_status_public_t)); + fprintf (stderr, "oshmem_shmem_init: isend/irecv\n"); + for (int i=0; ireq_status.MPI_ERROR) + fprintf(stderr, "rank %d: irecv failed with request error %d\n", i+1, reqs[i]->req_status.MPI_ERROR); + if (i+1 != ranks[i]) fprintf(stderr, "irecv(): expected %d got %d\n", i+1, ranks[i]); + } + fprintf(stderr, "isend/irecv done\n"); + } else { + MCA_PML_CALL(isend(prank, 1, MPI_INT, 0, -6667, MCA_PML_BASE_SEND_STANDARD, MPI_COMM_WORLD, &req)); + ompi_request_wait(&req, &status); + if (MPI_SUCCESS != status.MPI_ERROR) + fprintf(stderr, "rank %d: isend failed with status error %d\n", world_rank, status.MPI_ERROR); + if (MPI_SUCCESS != req->req_status.MPI_ERROR) + fprintf(stderr, "rank %d: isend failed with request error %d\n", world_rank, status.MPI_ERROR); + } + } + + { + int world_rank, world_size; + int *prank, *ranks; + MPI_Request req, *reqs; + MPI_Status status; + PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + PMPI_Comm_size(MPI_COMM_WORLD, &world_size); + prank = (int *)malloc(sizeof(int)); + *prank = world_rank + 1; + if (0 == world_rank) ranks = (int *)calloc(world_size, sizeof(int)); + if (0 == world_rank) reqs = (MPI_Request *)calloc(world_size, sizeof(MPI_Request)); + if (0 == world_rank) fprintf (stderr, "oshmem_shmem_init: PMPI_Gather\n"); + if (0 == world_rank) fprintf (stderr, "oshmem_shmem_init: PMPI_Gather\n"); + PMPI_Gather(prank, 1, MPI_INT, ranks, 1, MPI_INT, 0, MPI_COMM_WORLD); + if (0 == world_rank) + for (int i=0; i Date: Sun, 14 Jan 2018 13:13:53 +0900 Subject: [PATCH 2/2] coll/libnbc: do not recursively call opal_progress() instead of invoking ompi_request_test_all(), that will end up calling opal_progress() recursively, manually check the status of the requests. the same method is used in ompi_comm_request_progress() Refs open-mpi/ompi#3901 Signed-off-by: Gilles Gouaillardet --- ompi/mca/coll/libnbc/nbc.c | 43 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index 28f022e5c99..16eed3b1fb8 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -10,7 +10,7 @@ * rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015-2017 Research Organization for Information Science + * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler @@ -310,7 +310,8 @@ static inline void NBC_Free (NBC_Handle* handle) { * * to be called *only* from the progress thread !!! */ int NBC_Progress(NBC_Handle *handle) { - int flag, res, ret=NBC_CONTINUE; + int res, ret=NBC_CONTINUE; + bool flag; unsigned long size = 0; char *delim; int i; @@ -321,43 +322,27 @@ int NBC_Progress(NBC_Handle *handle) { return NBC_OK; } + flag = true; + if ((handle->req_count > 0) && (handle->req_array != NULL)) { NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count); #ifdef NBC_TIMING Test_time -= MPI_Wtime(); #endif - res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE); - if(res != OMPI_SUCCESS) { - // Attempt to cancel outstanding requests - for(i = 0; i < handle->req_count; ++i ) { - // If the request is complete, then try to report the error code - if( handle->req_array[i]->req_complete ) { - if( OMPI_SUCCESS != handle->req_array[i]->req_status.MPI_ERROR ) { - NBC_Error ("MPI Error in MPI_Testall() (req %d = %d)", i, handle->req_array[i]->req_status.MPI_ERROR); - } - } - else { - ompi_request_cancel(handle->req_array[i]); - // If the PML actually canceled the request, then wait on it - if( handle->req_array[i]->req_status._cancelled) { - ompi_request_wait(&handle->req_array[i], &status); - } - // Warn the user that we had to leave a PML message outstanding so - // bad things could happen if they continue using nonblocking collectives - else { - NBC_Error ("MPI Error: Not able to cancel the internal request %d. " - "Be aware that continuing to use nonblocking collectives on this communicator may result in undefined behavior.", i); - } + /* don't call ompi_request_test_all as it causes a recursive call into opal_progress */ + while (handle->req_count) { + ompi_request_t *subreq = handle->req_array[handle->req_count - 1]; + if (REQUEST_COMPLETE(subreq)) { + ompi_request_free(&subreq); + handle->req_count--; + } else { + flag = false; + break; } - } - - return OMPI_ERROR; } #ifdef NBC_TIMING Test_time += MPI_Wtime(); #endif - } else { - flag = 1; /* we had no open requests -> proceed to next round */ } /* a round is finished */