Integrate Modified AC-SpGEMM / GALATIC #26

Open · wants to merge 12 commits into base: master
8 changes: 4 additions & 4 deletions .gitmodules
```diff
@@ -1,6 +1,6 @@
 [submodule "ext/moderngpu"]
 	path = ext/moderngpu
-	url = https://ctcyang@github.com/ctcyang/moderngpu.git
-[submodule "ext/cub"]
-	path = ext/cub
-	url = https://ctcyang@github.com/NVlabs/cub.git
+	url = git@github.com:ctcyang/moderngpu.git
+[submodule "ext/GALATIC"]
+	path = ext/GALATIC
+	url = git@github.com:richardlett/GALATIC.git
```

**@ctcyang** (Collaborator) commented on Jun 30, 2021:

> nit: Kind of a personal preference thing, but could you change these to use HTTPS verification rather than SSH? I usually don't have an SSH key installed, so I typically use HTTPS.
>
> I've tried with HTTPS, and for people with access to the private GALATIC repo, as long as they enter my GitHub login + password, it lets them pull in the submodule.
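For reference, the HTTPS form the reviewer is asking about would look like this in `.gitmodules` (a sketch — same repositories, only the URL scheme changes):

```ini
[submodule "ext/moderngpu"]
	path = ext/moderngpu
	url = https://github.com/ctcyang/moderngpu.git
[submodule "ext/GALATIC"]
	path = ext/GALATIC
	url = https://github.com/richardlett/GALATIC.git
```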
8 changes: 4 additions & 4 deletions CMakeLists.txt
```diff
@@ -13,22 +13,22 @@ set( PROJ_PATH ${CMAKE_SOURCE_DIR})
 set( PROJ_OUT_PATH ${CMAKE_BINARY_DIR})
 set( PROJ_HEADERS "" )
 set( PROJ_LIBRARIES "" )
-set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/cub/cub")
+set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext")
 set( mgpu_SRC_FILES "ext/moderngpu/src/mgpucontext.cu" "ext/moderngpu/src/mgpuutil.cpp")
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin )
 #set( CUDA_CURAND_LIBRARY "$ENV{CUDA_HOME}/lib64/libcurand.so" )
 #set( CUDA_CUBLAS_LIBRARY "$ENV{CUDA_HOME}/lib64/libcublas.so" )
 set( CUDA_CUSPARSE_LIBRARY "$ENV{CUDA_HOME}/lib64/libcusparse.so" )
-#FILE( GLOB_RECURSE PROJ_SOURCES graphblas/*.cu ../graphblas/*.cpp )
+#FILE( G	LOB_RECURSE PROJ_SOURCES graphblas/*.cu ../graphblas/*.cpp )
```
**Collaborator:**

> nit: extra tab here

```diff
 #FILE( GLOB_RECURSE PROJ_LIBRARIES ext/cublas1.1/*.cu )
 FILE( GLOB_RECURSE PROJ_HEADERS graphblas/*.hpp)
 # nvcc flags
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr ")
 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-fpermissive;-arch=sm_35;-lineinfo;-Xptxas=-v;-dlcm=ca;-maxrregcount=64)
 #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_21)
 # needed for cudamalloc
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11" )
+set(CMAKE_CXX_FLAGS "-fpermissive -g  -std=c++14" )
```
**Collaborator:**

> nit: one space instead of two between -g and -std=c++14

```diff
 #set(CMAKE_CXX_FLAGS "-fpermissive -pg -m64 -std=c++11" )
 #set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11 -H" )
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
```
2 changes: 1 addition & 1 deletion Makefile
```diff
@@ -5,7 +5,7 @@ include common.mk
 #-------------------------------------------------------------------------------
 
 # Includes
-INC += -I$(MGPU_DIR) -I$(CUB_DIR) -I$(BOOST_DIR) -I$(GRB_DIR)
+INC += -I$(MGPU_DIR)  -I$(BOOST_DIR) -I$(GRB_DIR)
```
**Collaborator:**

> nit: one space instead of two between MGPU_DIR and BOOST_DIR


```diff
 #-------------------------------------------------------------------------------
 # Dependency Lists
```
10 changes: 6 additions & 4 deletions README.md
```diff
@@ -9,11 +9,13 @@ GraphBLAST is a GPU implementation of [GraphBLAS](http://www.graphblas.org), an
 
 ## Prerequisites
 
-This software has been tested on the following dependencies:
+This software has been tested to build with the following dependencies:
 
-* CUDA 9.1, 9.2
-* Boost 1.58
-* g++ 4.9.3, 5.4.0
+* CUDA 11.3
+  * (Change: CUDA > 11 is now required)
+* Boost 1.74
+* g++ 8.3.0
+  * (Change: C++14 is required)
 
 Optional:
```
1 change: 1 addition & 0 deletions ext/GALATIC
Submodule GALATIC added at e82d65
1 change: 0 additions & 1 deletion ext/cub
Submodule cub deleted from d62284
8 changes: 8 additions & 0 deletions graphblas/backend/cuda/descriptor.hpp
```diff
@@ -280,6 +280,14 @@ Info Descriptor::loadArgs(const po::variables_map& vm) {
     std::cout << "Error: incorrect nthread selection!\n";
   }
 
+  if(mode_ == "galatic") {
+    CHECK(set(GrB_MODE, GrB_GALATIC));
+  } else if (mode_ == "cusparse2") {
+    CHECK(set(GrB_MODE, GrB_CUSPARSE2));
+  } else {
+    std::cout << R"(Invalid mode: Options are "galatic" and "cusparse2")" << std::endl;
+  }
+
   // TODO(@ctcyang): Enable device selection using ndevice_
   // if( ndevice_!=0 )
```
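A minimal sketch of selecting the new mode from application code — the frontend `Descriptor::set` call and the field/value names are assumed to mirror the backend setter in this diff, not verified against this branch:

```cpp
// Sketch: pick the SpGEMM backend at runtime. Passing --mode galatic
// (or --mode cusparse2) on the command line has the same effect, since
// loadArgs() above maps the parsed mode_ string onto GrB_MODE.
graphblas::Descriptor desc;
CHECK(desc.set(graphblas::GrB_MODE, graphblas::GrB_GALATIC));
```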

27 changes: 23 additions & 4 deletions graphblas/backend/cuda/operations.hpp
```diff
@@ -43,11 +43,30 @@ Info mxm(Matrix<c>* C,
                            desc));
   } else if (typeid(c) == typeid(float) && typeid(a) == typeid(float) &&
              typeid(b) == typeid(float)) {
-    CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_,
-                           &B->sparse_, desc));
+
+    Desc_value s_mode;
+    CHECK(desc->get(GrB_MODE, &s_mode));
+
+    if (s_mode == GrB_CUSPARSE2)
+      CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_,
+                             &B->sparse_, desc));
+    else {
+      if (s_mode != GrB_GALATIC) {
+        std::cout << R"(Unknown mode (Options are: "cuspare2" and "galatic"; defaulting to galatic)" << std::endl;
```
**Collaborator:**

> nit: cusparse2

**Author:**

> 🤦 wow

```diff
+      }
+      CHECK(GALATIC_spgemm(&C->sparse_,
+                           op,
+                           &A->sparse_,
+                           &B->sparse_,
+                           desc));
+
+    }
   } else {
-    std::cout << "Error: Unmasked SpGEMM not implemented yet!\n";
-    return GrB_NOT_IMPLEMENTED;
+    CHECK(GALATIC_spgemm(&C->sparse_,
+                         op,
+                         &A->sparse_,
+                         &B->sparse_,
+                         desc));
   }
 } else {
   std::cout << "Error: SpMM and GEMM not implemented yet!\n";
```
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/reduce.hpp
```diff
@@ -1,7 +1,7 @@
 #ifndef GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_
 #define GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_
 
-#include <cub.cuh>
+#include <cub/cub.cuh>
 
 #include <iostream>
```
178 changes: 178 additions & 0 deletions graphblas/backend/cuda/spgemm.hpp
```diff
@@ -1,6 +1,8 @@
 #ifndef GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_
 #define GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_
 
+#include <GALATIC/GALATICMinimumIncludes.cuh>
```
**Collaborator:**

> Do you mind changing this to `#include "GALATICMinimumIncludes.cuh"`? According to the Google C++ style guide, nonsystem headers typically use quotation marks instead of angle brackets.
>
> That way, PROJ_INCLUDES can be `set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/GALATIC")`, which makes it easier for someone installing graphblast to debug, for example, when PROJ_INCLUDES can't find the GALATIC folder.

**Author:**

> Whoops — bad inference on my part: I thought the quoted form was only for files resolved by path relative to the current file, not ones found via `-I`!


#include "graphblas/backend/cuda/sparse_matrix.hpp"

#include <cuda.h>
Expand Down Expand Up @@ -108,6 +110,182 @@ Info spgemmMasked(SparseMatrix<c>* C,
C->csc_initialized_ = false;
return GrB_SUCCESS;
}
// Shallow copy graphblast sparsematrix -> Galatic dCSR format
**Collaborator:**

> nit: please add a newline between the following template function and the function above.

```diff
+template<typename T>
+static void matrixToGalatic(const SparseMatrix<T> *input, dCSR<T>& output) {
+  output.col_ids = reinterpret_cast<unsigned int*>(input->d_csrColInd_);
+  output.data = input->d_csrVal_;
+  output.row_offsets = reinterpret_cast<unsigned int*>(input->d_csrRowPtr_);
+  output.rows = input->nrows_;
+  output.cols = input->ncols_;
+  output.nnz = input->nvals_;
+}
+
+// Shallow copy Galatic dCSR format -> graphblast sparsematrix
+template<typename T>
+static void galaticToSparse(SparseMatrix<T> *output, const dCSR<T>& input) {
+  output->d_csrColInd_ = reinterpret_cast<Index*>(input.col_ids);
+  output->d_csrVal_ = input.data;
+  output->d_csrRowPtr_ = reinterpret_cast<Index*>(input.row_offsets);
+  output->nvals_ = input.nnz;
+  output->ncapacity_ = input.nnz;
+}
+
+// Nullize pointers in Galatic's sparse matrices;
+// Galatic's destructors check for null. Doing this will prevent double
+// freeing when shallow copying with matrixToGalatic & galaticToSparse
+template<typename T>
+static void nullizeGalaticMatrix(dCSR<T>& m) {
+  m.data = nullptr;
+  m.col_ids = nullptr;
+  m.row_offsets = nullptr;
+}
+
+// A generic shim between graphblast's and GALATIC's semiring interfaces
+template<typename NativeSR, typename a, typename b, typename c>
+struct GalaticSemiring : SemiRing<a, b, c> {
+  NativeSR nativeSemiring;
+
+  __device__ c multiply(const a& left, const b& right) const
+  { return nativeSemiring.mul_op(left, right); }
+  __device__ c add(const c& left, const c& right) const
+  { return nativeSemiring.add_op(left, right); }
+  __device__ static c AdditiveIdentity()
+  { return NativeSR::identity(); }
+};
```
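To make the shim concrete, a small usage sketch — it assumes `PlusMultipliesSemiring` from graphblast's stddef.hpp, whose `add_op`/`mul_op`/`identity` members the reworked REGISTER_SEMIRING macro (see the stddef.hpp diff below) generates:

```cpp
// Sketch: wrap a graphblast semiring so GALATIC kernels can consume it.
using NativeSR = graphblas::PlusMultipliesSemiring<float>;
GalaticSemiring<NativeSR, float, float, float> shim;
shim.nativeSemiring = NativeSR();
// GALATIC then calls shim.multiply(x, y), shim.add(s, t), and the static
// AdditiveIdentity(), which forward to mul_op, add_op, and identity().
```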

```diff
+template <typename c, typename a, typename b, typename SemiringT>
+Info GALATIC_spgemm(SparseMatrix<c>* C,
+                    SemiringT op,
+                    const SparseMatrix<a>* A,
+                    const SparseMatrix<b>* B,
+                    Descriptor* desc) {
+  Index A_nrows, A_ncols, A_nvals;
+  Index B_nrows, B_ncols, B_nvals;
+  Index C_nrows, C_ncols, C_nvals;
+
+  A_nrows = A->nrows_;
+  A_ncols = A->ncols_;
+  A_nvals = A->nvals_;
+  B_nrows = B->nrows_;
+  B_ncols = B->ncols_;
+  B_nvals = B->nvals_;
+  C_nrows = C->nrows_;
+  C_ncols = C->ncols_;
+
+  // Dimension compatibility check
+  if ((A_ncols != B_nrows) || (C_ncols != B_ncols) || (C_nrows != A_nrows)) {
+    std::cout << "Dim mismatch mxm" << std::endl;
+    std::cout << A_ncols << " " << B_nrows << std::endl;
+    std::cout << C_ncols << " " << B_ncols << std::endl;
+    std::cout << C_nrows << " " << A_nrows << std::endl;
+    return GrB_DIMENSION_MISMATCH;
+  }
+
+  if (C->d_csrColInd_ != NULL) {
+    CUDA_CALL(cudaFree(C->d_csrColInd_));
+    CUDA_CALL(cudaFree(C->d_csrVal_));
+    C->d_csrColInd_ = NULL;
+    C->d_csrVal_ = NULL;
+  }
+
+  if (C->d_csrRowPtr_ != NULL) {
+    CUDA_CALL(cudaFree(C->d_csrRowPtr_));
+    C->d_csrRowPtr_ = NULL;
+  }
+
+  if (C->h_csrColInd_ != NULL) {
+    free(C->h_csrColInd_);
+    free(C->h_csrVal_);
+    C->h_csrColInd_ = NULL;
+    C->h_csrVal_ = NULL;
+  }
+
+  dCSR<c> outMatrixGPU;
+  dCSR<a> leftInputMatrixGPU;
+  dCSR<b> rightInputMatrixGPU;
+
+  // shallow copy input matrices to galatic format
+  matrixToGalatic(A, leftInputMatrixGPU);
+  matrixToGalatic(B, rightInputMatrixGPU);
+
+  GPUMatrixMatrixMultiplyTraits DefaultTraits;
+
+  // GALATIC has its own semiring interface;
+  // GalaticSemiring is a shim here for conversion of graphblast-style
+  // SemiringT type. GalaticSemiring definition is above this function
+  GalaticSemiring<SemiringT, a, b, c> semiring_shim;
+  semiring_shim.nativeSemiring = op;
+
+  ExecutionStats stats;
+  try {
+    Desc_value nt_mode;
+    CHECK(desc->get(GrB_NT, &nt_mode));
+    const int num_threads = static_cast<int>(nt_mode);
+
+    switch (num_threads) {
+      case 64:
+        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
+            64, 4, 2, 8, 4, 16, 512, 8, 0, a, b, c,
+            GalaticSemiring<SemiringT, a, b, c>>
+            (leftInputMatrixGPU, rightInputMatrixGPU,
+             outMatrixGPU, DefaultTraits, stats, semiring_shim);
+        break;
+      case 128:
+        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
+            128, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c,
+            GalaticSemiring<SemiringT, a, b, c>>
+            (leftInputMatrixGPU, rightInputMatrixGPU,
+             outMatrixGPU, DefaultTraits, stats, semiring_shim);
+        break;
+      case 512:
+        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
+            512, 1, 1, 1, 2, 16, 512, 8, 0, a, b, c,
+            GalaticSemiring<SemiringT, a, b, c>>
+            (leftInputMatrixGPU, rightInputMatrixGPU,
+             outMatrixGPU, DefaultTraits, stats, semiring_shim);
+        break;
+      default:  // 256
+        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
+            256, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c,
+            GalaticSemiring<SemiringT, a, b, c>>
+            (leftInputMatrixGPU, rightInputMatrixGPU,
+             outMatrixGPU, DefaultTraits, stats, semiring_shim);
+        break;
+    }
+  } catch (std::exception& e) {
+    std::cerr
+      << "Exception occured in GALATIC SpGEMM, called from GALATIC_spgemm\n"
+      << "Exception:\n"
+      << e.what()
+      << std::endl;
+    return GrB_OUT_OF_MEMORY;  // the most likely issue, fixme
+  }
+
+  // shallow copy to native format.
+  galaticToSparse(C, outMatrixGPU);
+
+  // prevent allocations being freed twice when destructors are ran,
+  // as we are doing shallow copies:
+  //
+  //   A, B -> leftInputMatrixGPU, rightInputMatrixGPU
+  //   outMatrixGPU -> C.
+  nullizeGalaticMatrix(outMatrixGPU);
+  nullizeGalaticMatrix(leftInputMatrixGPU);
+  nullizeGalaticMatrix(rightInputMatrixGPU);
+
+  if (C->h_csrRowPtr_ == NULL)
+    C->h_csrRowPtr_ = reinterpret_cast<Index*>(malloc((A_nrows+1)*
+        sizeof(Index)));
+  C->h_csrColInd_ = reinterpret_cast<Index*>(malloc(C->ncapacity_*sizeof(Index)));
+  C->h_csrVal_ = reinterpret_cast<c*>(malloc(C->ncapacity_*sizeof(c)));
+
+  C->need_update_ = true;  // Set flag that we need to copy data from GPU
+  C->csr_initialized_ = true;
+  C->csc_initialized_ = false;
+  return GrB_SUCCESS;
+}
 
 template <typename c, typename a, typename b, typename m,
           typename BinaryOpT, typename SemiringT>
```
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/spmspv_inner.hpp
```diff
@@ -5,7 +5,7 @@
 #include <cusparse.h>
 
 #include <moderngpu.cuh>
-#include <cub.cuh>
+#include <cub/cub.cuh>
 
 #include <iostream>
 #include <algorithm>
```
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/spmv.hpp
```diff
@@ -5,7 +5,7 @@
 #include <cusparse.h>
 
 #include <moderngpu.cuh>
-#include <cub.cuh>
+#include <cub/cub.cuh>
 
 #include <iostream>
 #include <string>
```
35 changes: 16 additions & 19 deletions graphblas/stddef.hpp
```diff
@@ -73,21 +73,20 @@ struct less_equal {
     return lhs <= rhs;
   }
 };
 
 template <typename T_in1, typename T_in2 = T_in1, typename T_out = T_in1>
-struct first {
+struct left_arg {
   inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) {
     return lhs;
   }
 };
 
 template <typename T_in1, typename T_in2 = T_in1, typename T_out = T_in1>
-struct second {
+struct right_arg {
   inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) {
     return rhs;
   }
 };
 
-
 template <typename T_in1, typename T_in2 = T_in1, typename T_out = T_in1>
 struct minimum {
   inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) {
@@ -173,21 +172,19 @@ REGISTER_MONOID(NotEqualToMonoid, not_equal_to, std::numeric_limits<T_out>::max(
 }  // namespace graphblas
 
 // Semiring generator macro provided by Scott McMillan
-#define REGISTER_SEMIRING(SR_NAME, ADD_MONOID, MULT_BINARYOP) \
-template <typename T_in1, typename T_in2 = T_in1, typename T_out = T_in1> \
-struct SR_NAME \
-{ \
-  typedef T_out result_type; \
-  typedef T_out T_out_type; \
- \
-  inline T_out identity() const \
-  { return ADD_MONOID<T_out>().identity(); } \
- \
-  inline __host__ __device__ T_out add_op(T_out lhs, T_out rhs) \
-  { return ADD_MONOID<T_out>()(lhs, rhs); } \
- \
-  inline __host__ __device__ T_out mul_op(T_in1 lhs, T_in2 rhs) \
-  { return MULT_BINARYOP<T_in1, T_in2, T_out>()(lhs, rhs); } \
+#define REGISTER_SEMIRING(SR_NAME, ADD_MONOID, MULT_BINARYOP) \
+template <typename T_in1, typename T_in2 = T_in1, typename T_out = T_in1> \
+struct SR_NAME \
+{ \
+  typedef T_out result_type; \
+  typedef T_out T_out_type; \
+ \
+  static inline GRB_HOST_DEVICE T_out identity() \
+  { return ADD_MONOID<T_out>().identity(); } \
+  inline GRB_HOST_DEVICE T_out add_op(const T_out& lhs, const T_out& rhs) const \
+  { return ADD_MONOID<T_out>()(lhs, rhs); } \
+  inline GRB_HOST_DEVICE T_out mul_op(const T_in1& lhs, const T_in2& rhs) const \
+  { return MULT_BINARYOP<T_in1, T_in2, T_out>()(lhs, rhs); } \
 };
 
 namespace graphblas {
```
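To see what the reworked macro yields, here is a hypothetical instantiation and a rough hand expansion for `float`; the `PlusMonoid` and `multiplies` names are assumed from graphblast's stddef.hpp:

```cpp
// Hypothetical instantiation:
REGISTER_SEMIRING(PlusMultipliesSemiring, PlusMonoid, multiplies)

// Roughly what the macro expands to for T_in1 = T_in2 = T_out = float:
struct PlusMultipliesSemiringFloat {
  typedef float result_type;
  typedef float T_out_type;

  // identity() is now static and GRB_HOST_DEVICE-qualified, so
  // GalaticSemiring::AdditiveIdentity() can call it from device code.
  static inline GRB_HOST_DEVICE float identity()
  { return PlusMonoid<float>().identity(); }
  inline GRB_HOST_DEVICE float add_op(const float& lhs, const float& rhs) const
  { return PlusMonoid<float>()(lhs, rhs); }
  inline GRB_HOST_DEVICE float mul_op(const float& lhs, const float& rhs) const
  { return multiplies<float, float, float>()(lhs, rhs); }
};
```

The switch from `__host__ __device__` to `GRB_HOST_DEVICE` plus the static, device-callable `identity()` is what lets the GalaticSemiring shim in spgemm.hpp invoke these semirings from GALATIC device code.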