Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DirectVerbs] Make the code compilable without exp-verbs #103

Open
wants to merge 39 commits into
base: devel-dv
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
ce78eb6
Added --with-spectrum-mpi to configure
pakmarkthub Aug 4, 2021
66409ac
Introduced GDS_DRIVER_TYPE to gds_qp and gds_cq
pakmarkthub Aug 5, 2021
fdd2270
Set gds_qp and gds_cq dtype to MLX5_EXP in the creation functions
pakmarkthub Aug 5, 2021
0073f81
Changed the definition of gds_qp_init_attr_t and gds_send_wr. Also, r…
pakmarkthub Aug 5, 2021
78eadbb
Implemented gds_get_driver_type
pakmarkthub Aug 5, 2021
aade2ab
Initial implementation of mlx5-exp.cpp/hpp and moved create/destroy q…
pakmarkthub Aug 5, 2021
5e789db
Implemented gds_mlx5_exp_destroy_cq
pakmarkthub Aug 5, 2021
2a8ac7d
Implemented gds_destroy_qp and gds_destroy_cq by connecting to gds_ml…
pakmarkthub Aug 5, 2021
a9351f2
Reimplemented gds_create_qp by connecting to gds_mlx5_exp_create_qp
pakmarkthub Aug 5, 2021
e188668
Moved gds_send_wr from exp-verbs to ib-verbs in gds_kernel_latency.c
pakmarkthub Aug 5, 2021
07d03a0
Fixed compile issues in gdsync.cpp and mlx5-exp.hpp
pakmarkthub Aug 5, 2021
173841f
Added mlx5-exp.cpp to the compile list
pakmarkthub Aug 6, 2021
8d1ed50
Moved gds_prepare_send to gds_mlx5_exp_* and fixed compile errors in …
pakmarkthub Aug 6, 2021
10e164e
Modified gds_kernel_* applications to fit the new API/structs
pakmarkthub Aug 6, 2021
b1cab05
Moved gds_wait_request to gds_mlx5_exp_wait_request and made the form…
pakmarkthub Aug 11, 2021
17b246f
Moved gds_wait_request related functions to mlx5-exp
pakmarkthub Aug 11, 2021
6be27ac
Changed the definition of gds_send_request_t
pakmarkthub Sep 9, 2021
f026578
Added gds_mlx5_exp_send_request_t definition and supported functions
pakmarkthub Sep 9, 2021
92b09f0
Moved APIs that use gds_send_request_t to mlx5-exp.cpp
pakmarkthub Sep 9, 2021
e088b01
Removed include verbs_exp.h and peer_ops.h from public header files
pakmarkthub Sep 10, 2021
942aa89
Replaced structs related to ibv_exp_ with gds_* structs
pakmarkthub Sep 10, 2021
35db866
Removed include verbs_exp.h and peer_ops.h from objs.hpp
pakmarkthub Sep 10, 2021
24a0a20
Removed include verbs_exp.h from objs.cpp
pakmarkthub Sep 10, 2021
5d7c328
Removed include verbs_exp.h and peer_ops.h from gdsync.cpp
pakmarkthub Sep 10, 2021
bf1a358
Replaced all IBV_EXP_ enum with GDS_ in gdsync.cpp
pakmarkthub Sep 10, 2021
f95c72d
Replaced all IBV_EXP_ enum with GDS_ in mlx5.cpp
pakmarkthub Sep 10, 2021
6a23931
Replaced all IBV_EXP_ enum with GDS_ enum in gdsync_debug_hostregister…
pakmarkthub Sep 10, 2021
ab745de
Fixed bugs in gds_dump_wait_request
pakmarkthub Sep 10, 2021
f43f905
Updated some functions in gdsync_debug_hostregister_bug.cpp to reflec…
pakmarkthub Sep 10, 2021
aeb1ed7
Removed include verbs_exp.h and peer_ops.h from all files except from…
pakmarkthub Sep 10, 2021
59d20f5
Updated configure.ac and Makefile.am to make exp-verbs optional
pakmarkthub Sep 10, 2021
bfb683b
Fixed bug in configure.ac related to checking for exp-verbs
pakmarkthub Sep 10, 2021
e61a405
Defined gds_transport_t
pakmarkthub Oct 15, 2021
8720c86
Moved mlx5-exp.* into transports/mlx5-exp and implemented gds_transpo…
pakmarkthub Oct 15, 2021
de7d52d
Fixed bugs and modified the code to use the gds_transport_t interface
pakmarkthub Oct 15, 2021
c914e89
Removed driver_type and fixed bugs
pakmarkthub Oct 15, 2021
ed703cc
Added exp-verbs support checking in transport.hpp
pakmarkthub Oct 15, 2021
7340483
Fixed typo in configure.ac
pakmarkthub Oct 18, 2021
9986794
Fixed compile errors when compiling on x86
pakmarkthub Oct 18, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,6 @@ libtool
# Debug files
*.dSYM/
*.su

# Editor files
*.swp
15 changes: 10 additions & 5 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,18 @@ EXTRA_DIST = autogen.sh

include_HEADERS = include/gdsync.h
libgdsyncincludedir = $(includedir)/gdsync
libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h
libgdsyncinclude_HEADERS = include/gdsync/core.h include/gdsync/device.cuh include/gdsync/mlx5.h include/gdsync/tools.h

src_libgdsync_la_CFLAGS = $(AM_CFLAGS)
src_libgdsync_la_SOURCES = src/gdsync.cpp src/memmgr.cpp src/mem.cpp src/objs.cpp src/apis.cpp src/mlx5.cpp include/gdsync.h
src_libgdsync_la_LDFLAGS = -version-info @VERSION_INFO@

noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h
noinst_HEADERS = src/mem.hpp src/memmgr.hpp src/objs.hpp src/rangeset.hpp src/utils.hpp src/archutils.h src/mlnxutils.h

if COMPILE_EXP_VERBS
src_libgdsync_la_SOURCES += src/transports/mlx5-exp/mlx5-exp.cpp
noinst_HEADERS += src/transports/mlx5-exp/mlx5-exp.hpp
endif

# if enabled at configure time

Expand All @@ -36,7 +41,7 @@ bin_PROGRAMS = tests/gds_kernel_latency tests/gds_poll_lat tests/gds_kernel_loop
noinst_PROGRAMS = tests/rstest tests/wqtest

tests_gds_kernel_latency_SOURCES = tests/gds_kernel_latency.c tests/gpu_kernels.cu tests/pingpong.c tests/gpu.cpp
tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la -lmpi $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
tests_gds_kernel_latency_LDADD = $(top_builddir)/src/libgdsync.la $(MPILDFLAGS) $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

tests_rstest_SOURCES = tests/rstest.cpp
tests_rstest_LDADD =
Expand All @@ -45,10 +50,10 @@ tests_wqtest_SOURCES = tests/task_queue_test.cpp
tests_wqtest_LDADD = $(PTHREAD_LIBS)

tests_gds_poll_lat_SOURCES = tests/gds_poll_lat.c tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
tests_gds_poll_lat_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

tests_gds_sanity_SOURCES = tests/gds_sanity.cpp tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi -lmpi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
tests_gds_sanity_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(MPILDFLAGS) $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)

tests_gds_kernel_loopback_latency_SOURCES = tests/gds_kernel_loopback_latency.c tests/pingpong.c tests/gpu.cpp tests/gpu_kernels.cu
tests_gds_kernel_loopback_latency_LDADD = $(top_builddir)/src/libgdsync.la $(LIBGDSTOOLS) -lgdrapi $(LIBNVTX) -lcuda -lcudart $(PTHREAD_LIBS)
Expand Down
62 changes: 51 additions & 11 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -93,25 +93,54 @@ else
AC_SUBST(LIBGDSTOOLS)
fi

AC_ARG_WITH([mpi],
AC_HELP_STRING([--with-mpi], [ Set path to mpi installation ]))
if test x$with_mpi = x || test x$with_mpi = xno; then
AC_ARG_WITH([spectrum-mpi],
AC_HELP_STRING([--with-spectrum-mpi], [ Set path to Spectrum MPI installation ]))
if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi = xno; then
# assuming system location
mpi_home=/usr
MPICC=$with_home/bin/mpicc
MPICXX=$with_home/bin/mpic++
MPICC=/bin/mpicc
MPICXX=/bin/mpic++
MPILDFLAGS="-lmpi_ibm"
else
if test -d $with_mpi; then
mpi_home=$with_mpi
if test -d $with_spectrum_mpi; then
mpi_home=$with_spectrum_mpi
MPICC=${mpi_home}/bin/mpicc
MPICXX=${mpi_home}/bin/mpic++
CPPFLAGS="$CPPFLAGS -I${mpi_home}/include"
LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64"
MPILDFLAGS="-lmpi_ibm"
else
echo "MPI dir does not exist"
fi
fi

AC_ARG_WITH([mpi],
AC_HELP_STRING([--with-mpi], [ Set path to MPI installation ]))
if test x$with_spectrum_mpi = x || test x$with_spectrum_mpi == xno; then
if test x$with_mpi = x || test x$with_mpi = xno; then
# assuming system location
mpi_home=/usr
MPICC=/bin/mpicc
MPICXX=/bin/mpic++
MPILDFLAGS="-lmpi"
else
if test -d $with_mpi; then
mpi_home=$with_mpi
MPICC=${mpi_home}/bin/mpicc
MPICXX=${mpi_home}/bin/mpic++
CPPFLAGS="$CPPFLAGS -I${mpi_home}/include"
LDFLAGS="$LDFLAGS -L${mpi_home}/lib -L${mpi_home}/lib64"
MPILDFLAGS="-lmpi"
else
echo "MPI dir does not exist"
fi
fi
fi

if test x$with_spectrum_mpi != x && test x$with_spectrum_mpi != xno && test x$with_mpi != x && test x$with_mpi != xno; then
AC_MSG_ERROR([--with-mpi and --with-spectrum-mpi are mutually exclusive.])
fi

dnl Specify CUDA Location
AC_ARG_WITH(cuda-toolkit,
AC_HELP_STRING([--with-cuda-toolkit=CUDATKDIR], [ Specify CUDA toolkit installation directory (default: /usr/local/cuda)]),
Expand Down Expand Up @@ -161,11 +190,21 @@ dnl Checks for Verbs support
AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
AC_MSG_ERROR([ibv_get_device_list() not found. libgdsync requires libibverbs.]))

AC_CHECK_LIB(ibverbs, ibv_exp_create_qp,
AC_MSG_ERROR([ibv_exp_create_qp not found. libgdsync requires verbs extension support.]))
dnl ibv_exp_create_qp is an inline function. So, we check for exp_cmd instead.
AC_CHECK_LIB(ibverbs, ibv_exp_cmd_create_qp, [have_exp_verbs=1])
AC_CHECK_HEADER([infiniband/peer_ops.h], [have_peer_ops=1], [],
[[
#include <infiniband/peer_ops.h>
]])

if test "x$have_exp_verbs" != "x" && test "x$have_peer_ops" != "x"; then
AC_DEFINE([HAVE_EXP_VERBS], [1], [Define if exp-verbs exists.])
enable_exp_verbs=1
else
AC_MSG_WARN([This version of libgdsync cannot be used without exp-verbs.])
fi
AM_CONDITIONAL([COMPILE_EXP_VERBS], [test "x$enable_exp_verbs" != "x"])

AC_CHECK_HEADER(infiniband/peer_ops.h, [],
AC_MSG_ERROR([<infiniband/peer_ops.h> not found. libgdsync requires verbs peer-direct support.]))
AC_HEADER_STDC

dnl Checks for typedefs, structures, and compiler characteristics.
Expand All @@ -186,6 +225,7 @@ AC_MSG_NOTICE([Setting MPI_PATH = ${mpi_home} ])
AC_SUBST( MPI_PATH, [${mpi_home} ])
AC_SUBST( MPICC, [${MPICC} ])
AC_SUBST( MPICXX, [${MPICXX} ])
AC_SUBST( MPILDFLAGS, [${MPILDFLAGS} ])

CPPFLAGS="$CPPFLAGS -I$CUDA_DRV_PATH/include -I$CUDA_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_DRV_PATH/lib64 -L$CUDA_DRV_PATH/lib -L$CUDA_PATH/lib64 -L$CUDA_PATH/lib"
Expand Down
2 changes: 0 additions & 2 deletions include/gdsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@
*/

#include <infiniband/verbs.h>
#include <infiniband/verbs_exp.h>
#include <infiniband/peer_ops.h>

#include <cuda.h>
#include <gdrapi.h>
Expand Down
43 changes: 23 additions & 20 deletions include/gdsync/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,35 +40,34 @@
((((v) & 0x0000ffffU) >> 0 ) >= (unsigned)GDS_API_MINOR_VERSION) )

typedef enum gds_param {
GDS_PARAM_VERSION,
GDS_NUM_PARAMS
GDS_PARAM_VERSION,
GDS_NUM_PARAMS
} gds_param_t;

int gds_query_param(gds_param_t param, int *value);

enum gds_create_qp_flags {
GDS_CREATE_QP_DEFAULT = 0,
GDS_CREATE_QP_WQ_ON_GPU = 1<<0,
GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
GDS_CREATE_QP_DEFAULT = 0,
GDS_CREATE_QP_WQ_ON_GPU = 1<<0,
GDS_CREATE_QP_TX_CQ_ON_GPU = 1<<1,
GDS_CREATE_QP_RX_CQ_ON_GPU = 1<<2,
GDS_CREATE_QP_WQ_DBREC_ON_GPU = 1<<5,
};

typedef struct ibv_exp_qp_init_attr gds_qp_init_attr_t;
typedef struct ibv_exp_send_wr gds_send_wr;
typedef struct ibv_qp_init_attr gds_qp_init_attr_t;
typedef struct ibv_send_wr gds_send_wr;

struct gds_cq {
typedef struct gds_cq {
struct ibv_cq *cq;
uint32_t curr_offset;
};
} gds_cq_t;

struct gds_qp {
typedef struct gds_qp {
struct ibv_qp *qp;
struct gds_cq send_cq;
struct gds_cq recv_cq;
struct ibv_exp_res_domain * res_domain;
struct gds_cq *send_cq;
struct gds_cq *recv_cq;
struct ibv_context *dev_context;
};
} gds_qp_t;

/* \brief: Create a peer-enabled QP attached to the specified GPU id.
*
Expand Down Expand Up @@ -153,9 +152,11 @@ enum {
*/

typedef struct gds_send_request {
struct ibv_exp_peer_commit commit;
struct peer_op_wr wr[GDS_SEND_INFO_MAX_OPS];
uint8_t reserved0[32];
uint8_t reserved1[56 * GDS_SEND_INFO_MAX_OPS];
uint8_t pad0[32];
} gds_send_request_t;
static_assert(sizeof(gds_send_request_t) % 64 == 0, "gds_send_request_t must be 64-byte aligned.");

int gds_prepare_send(struct gds_qp *qp, gds_send_wr *p_ewr, gds_send_wr **bad_ewr, gds_send_request_t *request);
int gds_stream_post_send(CUstream stream, gds_send_request_t *request);
Expand All @@ -167,9 +168,11 @@ int gds_stream_post_send_all(CUstream stream, int count, gds_send_request_t *req
*/

typedef struct gds_wait_request {
struct ibv_exp_peer_peek peek;
struct peer_op_wr wr[GDS_WAIT_INFO_MAX_OPS];
uint8_t reserved0[40];
uint8_t reserved1[56 * GDS_WAIT_INFO_MAX_OPS];
uint8_t pad0[24];
} gds_wait_request_t;
static_assert(sizeof(gds_wait_request_t) % 64 == 0, "gds_wait_request_t must be 64-byte aligned.");

/**
* Initializes a wait request out of the next heading CQE, which is kept in
Expand Down
Loading