Skip to content

Commit

Permalink
Add CUDA/HIP implementations of reduction operators
Browse files Browse the repository at this point in the history
The operators are generated from macros. Function pointers to
kernel launch functions are stored inside the ompi_op_t as a
pointer to a struct that is filled if accelerator support is available.

The ompi_op* API is extended to include versions taking streams and device
IDs to allow enqueuing operators on streams. The old functions map
to the stream versions with a NULL stream.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
  • Loading branch information
devreal committed Jun 18, 2024
1 parent 55c0bda commit 3ab3371
Show file tree
Hide file tree
Showing 24 changed files with 8,708 additions and 48 deletions.
120 changes: 120 additions & 0 deletions config/opal_check_cudart.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
dnl of Tennessee Research Foundation. All rights
dnl reserved.
dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
dnl University of Stuttgart. All rights reserved.
dnl Copyright (c) 2004-2005 The Regents of the University of California.
dnl All rights reserved.
dnl Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
dnl reserved.
dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved.
dnl Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
dnl Copyright (c) 2015 Research Organization for Information Science
dnl and Technology (RIST). All rights reserved.
dnl Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl


# OPAL_CHECK_CUDART(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# check if CUDA runtime library support can be found. sets prefix_{CPPFLAGS,
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
# support, otherwise executes action-if-not-found

#
# Check for CUDA support
#
AC_DEFUN([OPAL_CHECK_CUDART],[
    dnl Locate the CUDA runtime: header cuda_runtime.h and library cudart.
    dnl
    dnl Arguments:
    dnl   1: prefix handed to the package check for the resulting
    dnl      CPPFLAGS, LDFLAGS and LIBS variables
    dnl   2: optional action run when the runtime was found
    dnl   3: optional action run when the runtime was not found
    dnl
    dnl Besides the prefixed variables this sets CUDART_SUPPORT to 1 or 0,
    dnl substitutes common_cudart_CPPFLAGS, defines OPAL_CUDART_SUPPORT and
    dnl the OPAL_cudart_support conditional. The global CPPFLAGS, LDFLAGS
    dnl and LIBS are restored before returning.
    OPAL_VAR_SCOPE_PUSH([cudart_save_CPPFLAGS cudart_save_LDFLAGS cudart_save_LIBS])

    cudart_save_CPPFLAGS="$CPPFLAGS"
    cudart_save_LDFLAGS="$LDFLAGS"
    cudart_save_LIBS="$LIBS"

    #
    # Check to see if the user provided paths for CUDART
    #
    AC_ARG_WITH([cudart],
                [AS_HELP_STRING([--with-cudart=DIR],
                                [Path to the CUDA runtime library and header files])])
    AC_ARG_WITH([cudart-libdir],
                [AS_HELP_STRING([--with-cudart-libdir=DIR],
                                [Search for CUDA runtime libraries in DIR])])

    ####################################
    #### Check for CUDA runtime library
    ####################################
    AC_MSG_CHECKING([if --with-cudart is set])
    # An empty value, "yes" and "no" carry no directory; everything else
    # is treated as the installation directory supplied by the user.
    AS_IF([test "x$with_cudart" = "x" || test "x$with_cudart" = "xyes" || test "x$with_cudart" = "xno"],
          [opal_check_cudart_happy=no
           AC_MSG_RESULT([not set (--with-cudart=$with_cudart)])],
          [AS_IF([test ! -d "$with_cudart"],
                 [opal_check_cudart_happy=no
                  AC_MSG_RESULT([not found])
                  AC_MSG_WARN([Directory $with_cudart not found])],
                 [AS_IF([test ! -f "$with_cudart/include/cuda_runtime.h"],
                        [opal_check_cudart_happy=no
                         AC_MSG_RESULT([not found])
                         AC_MSG_WARN([Could not find cuda_runtime.h in $with_cudart/include])],
                        [opal_check_cudart_happy=yes
                         opal_cudart_incdir="$with_cudart/include"
                         AC_MSG_RESULT([found ($with_cudart)])])])])

    # No usable directory so far: unless CUDART was explicitly disabled,
    # derive the installation prefix from the location of nvcc.
    AS_IF([test "$opal_check_cudart_happy" = "no" && test "x$with_cudart" != "xno"],
          [AC_PATH_PROG([nvcc_bin], [nvcc], ["not-found"])
           AS_IF([test "$nvcc_bin" = "not-found"],
                 [AC_MSG_WARN([Could not find nvcc binary])],
                 [nvcc_dirname=`AS_DIRNAME([$nvcc_bin])`
                  with_cudart=$nvcc_dirname/../
                  opal_cudart_incdir=$nvcc_dirname/../include
                  opal_check_cudart_happy=yes])])

    # Default library directory when the user did not name one.
    AS_IF([test x"$with_cudart_libdir" = "x"],
          [with_cudart_libdir=$with_cudart/lib64/])

    # Verify that the header and the library are actually usable.
    AS_IF([test "$opal_check_cudart_happy" = "yes"],
          [OAC_CHECK_PACKAGE([cudart],
                             [$1],
                             [cuda_runtime.h],
                             [cudart],
                             [cudaMalloc],
                             [opal_check_cudart_happy="yes"],
                             [opal_check_cudart_happy="no"])])

    AC_MSG_CHECKING([if have cuda runtime library support])
    if test "$opal_check_cudart_happy" = "yes"; then
        AC_MSG_RESULT([yes (-I$opal_cudart_incdir)])
        CUDART_SUPPORT=1
        common_cudart_CPPFLAGS="-I$opal_cudart_incdir"
        AC_SUBST([common_cudart_CPPFLAGS])
    else
        AC_MSG_RESULT([no])
        CUDART_SUPPORT=0
    fi

    OPAL_SUMMARY_ADD([Accelerators], [CUDART support], [], [$opal_check_cudart_happy])
    AM_CONDITIONAL([OPAL_cudart_support], [test "x$CUDART_SUPPORT" = "x1"])
    AC_DEFINE_UNQUOTED([OPAL_CUDART_SUPPORT],$CUDART_SUPPORT,
                       [Whether we have cuda runtime library support])

    CPPFLAGS=${cudart_save_CPPFLAGS}
    LDFLAGS=${cudart_save_LDFLAGS}
    LIBS=${cudart_save_LIBS}

    # Run the optional caller-supplied actions; both may be empty.
    AS_IF([test "$opal_check_cudart_happy" = "yes"],
          [$2],
          [$3])

    OPAL_VAR_SCOPE_POP
])dnl
4 changes: 3 additions & 1 deletion ompi/mca/op/base/op_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -42,6 +42,7 @@ static void module_constructor(ompi_op_base_module_t *m)
{
/* Put a freshly constructed op module into a known empty state:
 * no enable hook and no associated op. */
m->opm_enable = NULL;
m->opm_op = NULL;
/* Modules start out as non-device modules; the op selection code shown
 * elsewhere on this page reads this flag to decide whether it copies the
 * module's stream function tables instead of the regular ones. */
m->opm_device_enabled = false;
/* Clear the 2-buffer and 3-buffer function tables. */
/* NOTE(review): opm_stream_fns / opm_3buff_stream_fns are read by the
 * selection code but are not cleared here -- confirm they share storage
 * with these tables or are initialized elsewhere. */
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
}
Expand All @@ -50,6 +51,7 @@ static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m)
{
/* Same initialization as the generic module constructor, applied to the
 * versioned 1.0.0 module type: no enable hook and no associated op. */
m->opm_enable = NULL;
m->opm_op = NULL;
/* Not a device module until something sets this flag. */
m->opm_device_enabled = false;
/* Clear the 2-buffer and 3-buffer function tables. */
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
}
Expand Down
60 changes: 44 additions & 16 deletions ompi/mca/op/base/op_base_op_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -152,22 +152,50 @@ int ompi_op_base_op_select(ompi_op_t *op)
}

/* Copy over the non-NULL pointers */
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_fns[i]) {
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
op->o_func.intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
if (avail->ao_module->opm_device_enabled) {
if (NULL == op->o_device_op) {
op->o_device_op = calloc(1, sizeof(*op->o_device_op));
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
op->o_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_fns[i];
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_stream_fns[i]) {
if (NULL != op->o_device_op->do_intrinsic.modules[i]) {
OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]);
}
op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_stream_fns[i];
op->o_device_op->do_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_stream_fns[i]) {
if (NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) {
OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]);
}
op->o_device_op->do_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_stream_fns[i];
op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}
}
} else {
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_fns[i]) {
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
op->o_func.intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]);
op->o_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_fns[i];
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}
}
}

Expand Down
84 changes: 84 additions & 0 deletions ompi/mca/op/cuda/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#
# Copyright (c) 2023 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# This component provides support for offloading reduce ops to CUDA devices.
#
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
# for more details on how to make Open MPI components.

# First, list all .h and .c sources. It is necessary to list all .h
# files so that they will be picked up in the distribution tarball.

# Preprocessor flags found by configure for the CUDA driver and runtime.
AM_CPPFLAGS = $(op_cuda_CPPFLAGS) $(op_cudart_CPPFLAGS)

# Help file installed alongside the component.
dist_ompidata_DATA = help-ompi-mca-op-cuda.txt

# Host-side sources and headers, built by the regular C compiler.
sources = \
    op_cuda_component.c \
    op_cuda.h \
    op_cuda_functions.c \
    op_cuda_impl.h

# Device-side sources, built by nvcc through the suffix rule below.
cu_sources = op_cuda_impl.cu

NVCC = nvcc -g
NVCCFLAGS = --std c++17 --gpu-architecture=compute_52

# Turn a .cu file into a libtool object. libtool drives nvcc; -Wc hands
# -Xcompiler -fPIC -g through libtool to nvcc unchanged.
.cu.l$(OBJEXT):
	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
	    --mode=compile $(NVCC) -prefer-non-pic $(NVCCFLAGS) \
	    -Wc,-Xcompiler,-fPIC,-g -c $<

# Open MPI components can be compiled two ways:
#
# 1. As a standalone dynamic shared object (DSO), sometimes called a
# dynamically loadable library (DLL).
#
# 2. As a static library that is slurped up into the upper-level
# libmpi library (regardless of whether libmpi is a static or dynamic
# library). This is called a "Libtool convenience library".
#
# The component needs to create an output library in this top-level
# component directory, and named either mca_<type>_<name>.la (for DSO
# builds) or libmca_<type>_<name>.la (for static builds). The OMPI
# build system will have set the
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
# which way this component should be built.

# Exactly one of the two lists is non-empty; both are defined in either
# branch so later references never hit an undefined variable.
if MCA_BUILD_ompi_op_cuda_DSO
component_noinst =
component_install = mca_op_cuda.la
else
component_noinst = libmca_op_cuda.la
component_install =
endif

# Specific information for DSO builds.
#
# The DSO should install itself in $(ompilibdir) (by default,
# $prefix/lib/openmpi).

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_op_cuda_la_SOURCES = $(sources)
# Objects and libraries go into LIBADD so they follow the component's own
# objects on the link line; LDFLAGS carries linker options only, including
# the library search paths found by configure.
mca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) \
    $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
    $(op_cuda_LIBS) $(op_cudart_LIBS)
mca_op_cuda_la_LDFLAGS = -module -avoid-version \
    $(op_cuda_LDFLAGS) $(op_cudart_LDFLAGS)
# The .cu files are compiled by the suffix rule, not by automake itself;
# listing them here makes them part of the distribution.
EXTRA_mca_op_cuda_la_SOURCES = $(cu_sources)

# Specific information for static builds.
#
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
# slurp in the resulting .la library into libmpi.

noinst_LTLIBRARIES = $(component_noinst)
libmca_op_cuda_la_SOURCES = $(sources)
# Objects and libraries go into LIBADD; LDFLAGS carries linker options
# only, including the library search paths found by configure.
libmca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) \
    $(op_cuda_LIBS) $(op_cudart_LIBS)
libmca_op_cuda_la_LDFLAGS = -module -avoid-version \
    $(op_cuda_LDFLAGS) $(op_cudart_LDFLAGS)
# The .cu files are compiled by the suffix rule, not by automake itself;
# listing them here makes them part of the distribution.
EXTRA_libmca_op_cuda_la_SOURCES = $(cu_sources)

41 changes: 41 additions & 0 deletions ompi/mca/op/cuda/configure.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# -*- shell-script -*-
#
# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2023 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
# All Rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

#
# If CUDA support was requested, then build the CUDA support library.
# This code makes sure the checks from opal_check_cuda.m4 and
# opal_check_cudart.m4 are run. The resulting flags and libs are
# substituted as op_cuda_{CPPFLAGS,LDFLAGS,LIBS} and
# op_cudart_{CPPFLAGS,LDFLAGS,LIBS}.

AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
    dnl Configure hook of the op/cuda component. The first argument is run
    dnl when the component can be built, the second one otherwise.

    AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile])

    OPAL_CHECK_CUDA([op_cuda])
    OPAL_CHECK_CUDART([op_cudart])

    # The component compiles .cu files with nvcc and links against the
    # CUDA runtime, so both the driver and the runtime check must succeed.
    AS_IF([test "x$CUDA_SUPPORT" = "x1" && test "x$CUDART_SUPPORT" = "x1"],
          [$1],
          [$2])

    AC_SUBST([op_cuda_CPPFLAGS])
    AC_SUBST([op_cuda_LDFLAGS])
    AC_SUBST([op_cuda_LIBS])

    AC_SUBST([op_cudart_CPPFLAGS])
    AC_SUBST([op_cudart_LDFLAGS])
    AC_SUBST([op_cudart_LIBS])

])dnl
15 changes: 15 additions & 0 deletions ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- text -*-
#
# Copyright (c) 2023 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's CUDA operator component
#
[CUDA call failed]
"CUDA call %s failed: %s: %s\n"
Loading

0 comments on commit 3ab3371

Please sign in to comment.