From e3ad86eeba8cbdf62471c16598f06073baae093e Mon Sep 17 00:00:00 2001
From: Akshay Venkatesh
Date: Tue, 1 Oct 2024 17:18:17 +0000
Subject: [PATCH] ompi/coll/cuda: Implement reduce local

Signed-off-by: Akshay Venkatesh
(cherry picked from commit e25e89768b6cd8b55c7e0095df6940df11209780)
---
Note: this differs from the cherry-picked commit in
mca_coll_cuda_reduce_local(): the staged send buffer is now freed when the
check of rbuf fails, and the function carries a header comment. Whitespace
of context lines was reconstructed; use "git apply --ignore-whitespace" if
the alignment in the target tree differs.

 ompi/mca/coll/cuda/coll_cuda.h        |  5 +++
 ompi/mca/coll/cuda/coll_cuda_module.c |  3 ++
 ompi/mca/coll/cuda/coll_cuda_reduce.c | 70 +++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)

diff --git a/ompi/mca/coll/cuda/coll_cuda.h b/ompi/mca/coll/cuda/coll_cuda.h
index 58f9ec4ae5b..afedc632ee5 100644
--- a/ompi/mca/coll/cuda/coll_cuda.h
+++ b/ompi/mca/coll/cuda/coll_cuda.h
@@ -54,6 +54,11 @@ int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);
 
+int mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
+                               struct ompi_datatype_t *dtype,
+                               struct ompi_op_t *op,
+                               mca_coll_base_module_t *module);
+
 int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
diff --git a/ompi/mca/coll/cuda/coll_cuda_module.c b/ompi/mca/coll/cuda/coll_cuda_module.c
index b8f2ddf5e90..7d3ccd23dcb 100644
--- a/ompi/mca/coll/cuda/coll_cuda_module.c
+++ b/ompi/mca/coll/cuda/coll_cuda_module.c
@@ -40,6 +40,7 @@ static void mca_coll_cuda_module_destruct(mca_coll_cuda_module_t *module)
 {
     OBJ_RELEASE(module->c_coll.coll_allreduce_module);
     OBJ_RELEASE(module->c_coll.coll_reduce_module);
+    OBJ_RELEASE(module->c_coll.coll_reduce_local_module);
     OBJ_RELEASE(module->c_coll.coll_reduce_scatter_block_module);
     OBJ_RELEASE(module->c_coll.coll_scatter_module);
     /* If the exscan module is not NULL, then this was an
@@ -103,6 +104,7 @@ mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
     cuda_module->super.coll_gather = NULL;
     cuda_module->super.coll_gatherv = NULL;
     cuda_module->super.coll_reduce = mca_coll_cuda_reduce;
+    cuda_module->super.coll_reduce_local = mca_coll_cuda_reduce_local;
     cuda_module->super.coll_reduce_scatter = NULL;
     cuda_module->super.coll_reduce_scatter_block = mca_coll_cuda_reduce_scatter_block;
     cuda_module->super.coll_scan = mca_coll_cuda_scan;
@@ -135,6 +137,7 @@ int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
 
     CHECK_AND_RETAIN(comm, s, allreduce);
     CHECK_AND_RETAIN(comm, s, reduce);
+    CHECK_AND_RETAIN(comm, s, reduce_local);
     CHECK_AND_RETAIN(comm, s, reduce_scatter_block);
     CHECK_AND_RETAIN(comm, s, scatter);
     if (!OMPI_COMM_IS_INTER(comm)) {
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index d8a6cef1419..7743a078743 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -83,3 +83,73 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
     }
     return rc;
 }
+
+/**
+ * Reduce count elements of sbuf into rbuf using op.
+ *
+ * Both buffers are passed to mca_coll_cuda_check_buf().  A negative result
+ * is returned to the caller as-is.  A positive result (presumably: the
+ * buffer is CUDA device memory -- confirm against mca_coll_cuda_check_buf)
+ * makes this function stage the buffer through a malloc'ed copy filled with
+ * mca_coll_cuda_memcpy(), so that ompi_op_reduce() runs on the copies.  A
+ * staged rbuf is copied back to the caller's buffer before returning.
+ *
+ * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when a staging buffer
+ * cannot be allocated, or the negative mca_coll_cuda_check_buf() result.
+ */
+int
+mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, size_t count,
+                           struct ompi_datatype_t *dtype,
+                           struct ompi_op_t *op,
+                           mca_coll_base_module_t *module)
+{
+    ptrdiff_t gap;
+    char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
+    size_t bufsize;
+    int rc;
+
+    bufsize = opal_datatype_span(&dtype->super, count, &gap);
+
+    rc = mca_coll_cuda_check_buf((void *)sbuf);
+    if (rc < 0) {
+        return rc;
+    }
+
+    if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
+        sbuf1 = (char*)malloc(bufsize);
+        if (NULL == sbuf1) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
+        sbuf = sbuf1 - gap;
+    }
+
+    rc = mca_coll_cuda_check_buf(rbuf);
+    if (rc < 0) {
+        /* sbuf1 may already hold a staged copy; release it (free(NULL) is a no-op) */
+        free(sbuf1);
+        return rc;
+    }
+
+    if (rc > 0) {
+        rbuf1 = (char*)malloc(bufsize);
+        if (NULL == rbuf1) {
+            free(sbuf1);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
+        rbuf2 = rbuf; /* save away original buffer */
+        rbuf = rbuf1 - gap;
+    }
+
+    ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
+    rc = OMPI_SUCCESS;
+
+    free(sbuf1);
+    if (NULL != rbuf1) {
+        rbuf = rbuf2;
+        mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
+        free(rbuf1);
+    }
+    return rc;
+}