diff --git a/.gitmodules b/.gitmodules index 792535b..b6ccde3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "ext/moderngpu"] path = ext/moderngpu url = https://ctcyang@github.com/ctcyang/moderngpu.git -[submodule "ext/cub"] - path = ext/cub - url = https://ctcyang@github.com/NVlabs/cub.git +[submodule "ext/GALATIC"] + path = ext/GALATIC + url = https://ctcyang@github.com/richardlett/GALATIC.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e7d002..e3ce001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ set( PROJ_PATH ${CMAKE_SOURCE_DIR}) set( PROJ_OUT_PATH ${CMAKE_BINARY_DIR}) set( PROJ_HEADERS "" ) set( PROJ_LIBRARIES "" ) -set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/cub/cub") +set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/GALATIC") set( mgpu_SRC_FILES "ext/moderngpu/src/mgpucontext.cu" "ext/moderngpu/src/mgpuutil.cpp") set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin ) #set( CUDA_CURAND_LIBRARY "$ENV{CUDA_HOME}/lib64/libcurand.so" ) @@ -23,12 +23,12 @@ set( CUDA_CUSPARSE_LIBRARY "$ENV{CUDA_HOME}/lib64/libcusparse.so" ) #FILE( GLOB_RECURSE PROJ_LIBRARIES ext/cublas1.1/*.cu ) FILE( GLOB_RECURSE PROJ_HEADERS graphblas/*.hpp) # nvcc flags -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr ") #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-fpermissive;-arch=sm_35;-lineinfo;-Xptxas=-v;-dlcm=ca;-maxrregcount=64) #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_21) # needed for cudamalloc set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11" ) +set(CMAKE_CXX_FLAGS "-fpermissive -g -std=c++14" ) #set(CMAKE_CXX_FLAGS "-fpermissive -pg -m64 -std=c++11" ) #set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11 -H" ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") diff --git a/Makefile b/Makefile index b683a75..bf0e8e7 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ include common.mk #------------------------------------------------------------------------------- # Includes -INC += -I$(MGPU_DIR) -I$(CUB_DIR) -I$(BOOST_DIR) -I$(GRB_DIR) +INC += -I$(MGPU_DIR) -I$(BOOST_DIR) -I$(GRB_DIR) #------------------------------------------------------------------------------- # Dependency Lists diff --git a/README.md b/README.md index 46c81f1..f5eb211 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,13 @@ GraphBLAST is a GPU implementation of [GraphBLAS](http://www.graphblas.org), an ## Prerequisites -This software has been tested on the following dependencies: +This software has been tested to build with the following dependencies: -* CUDA 9.1, 9.2 -* Boost 1.58 -* g++ 4.9.3, 5.4.0 +* CUDA 11.3 + * (Change: CUDA > 11 is now required) +* Boost 1.74 +* g++ 8.3.0 + * (Change: C++14 is required) Optional: diff --git a/ext/GALATIC b/ext/GALATIC new file mode 160000 index 0000000..e82d65a --- /dev/null +++ b/ext/GALATIC @@ -0,0 +1 @@ +Subproject commit e82d65a99006f60e98330daa0424319c94b62bd7 diff --git a/ext/cub b/ext/cub deleted file mode 160000 index d622848..0000000 --- a/ext/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d622848f9fb62f13e5e064e1deb43b6bcbb12bad diff --git a/graphblas/backend/cuda/descriptor.hpp b/graphblas/backend/cuda/descriptor.hpp index 9788143..0f1ae7d 100644 --- a/graphblas/backend/cuda/descriptor.hpp +++ b/graphblas/backend/cuda/descriptor.hpp @@ -280,6 +280,14 @@ Info Descriptor::loadArgs(const po::variables_map& vm) { std::cout << "Error: incorrect nthread selection!\n"; } + if(mode_ == "galatic") { + CHECK(set(GrB_MODE, GrB_GALATIC)); + } else if (mode_ == "cusparse2") { + CHECK(set(GrB_MODE, GrB_CUSPARSE2)); + } else { + std::cout << R"(Invalid mode: Options are "galatic" and "cusparse2")" << std::endl; + } + // TODO(@ctcyang): Enable device selection using ndevice_ // if( ndevice_!=0 ) diff --git a/graphblas/backend/cuda/operations.hpp b/graphblas/backend/cuda/operations.hpp index fdcb1ba..33af949 100644 --- a/graphblas/backend/cuda/operations.hpp +++ b/graphblas/backend/cuda/operations.hpp @@ -43,11 +43,30 @@ Info mxm(Matrix* C, desc)); } else if (typeid(c) == typeid(float) && typeid(a) == typeid(float) && typeid(b) == typeid(float)) { - CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_, - &B->sparse_, desc)); + + Desc_value s_mode; + CHECK(desc->get(GrB_MODE, &s_mode)); + + if (s_mode == GrB_CUSPARSE2) + CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_, + &B->sparse_, desc)); + else { + if (s_mode != GrB_GALATIC) { + std::cout << R"(Unknown mode (Options are: "cusparse2" and "galatic"; defaulting to galatic)" << std::endl; + } + CHECK(GALATIC_spgemm(&C->sparse_, + op, + &A->sparse_, + &B->sparse_, + desc)); + + } } else { - std::cout << "Error: Unmasked SpGEMM not implemented yet!\n"; - return GrB_NOT_IMPLEMENTED; + CHECK(GALATIC_spgemm(&C->sparse_, + op, + &A->sparse_, + &B->sparse_, + desc)); } } else { std::cout << "Error: SpMM and GEMM not implemented yet!\n"; diff --git a/graphblas/backend/cuda/reduce.hpp b/graphblas/backend/cuda/reduce.hpp index 3adbca4..5c90586 100644 --- a/graphblas/backend/cuda/reduce.hpp +++ b/graphblas/backend/cuda/reduce.hpp @@ -1,7 +1,7 @@ #ifndef GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_ #define GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_ -#include +#include #include diff --git a/graphblas/backend/cuda/spgemm.hpp b/graphblas/backend/cuda/spgemm.hpp index 20aae02..66e78e6 100644 --- a/graphblas/backend/cuda/spgemm.hpp +++ b/graphblas/backend/cuda/spgemm.hpp @@ -1,6 +1,8 @@ #ifndef GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_ #define GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_ +#include "GALATICMinimumIncludes.cuh" + #include "graphblas/backend/cuda/sparse_matrix.hpp" #include @@ -108,6 +110,182 @@ Info spgemmMasked(SparseMatrix* C, C->csc_initialized_ = false; return GrB_SUCCESS; } +// Shallow copy graphblast sparsematrix -> Galatic dCSR format +template +static void matrixToGalatic(const SparseMatrix *input , dCSR& output) { + output.col_ids = reinterpret_cast(input->d_csrColInd_); + output.data = input->d_csrVal_; + output.row_offsets = reinterpret_cast(input->d_csrRowPtr_); + output.rows = input->nrows_; + output.cols = input->ncols_; + output.nnz = input->nvals_; +} + +// Shallow copy Galatic dCSR format -> graphblast sparsematrix +template +static void galaticToSparse(SparseMatrix *output , const dCSR& input) { + output->d_csrColInd_ = reinterpret_cast(input.col_ids); + output->d_csrVal_ = input.data; + output->d_csrRowPtr_ = reinterpret_cast(input.row_offsets); + output->nvals_ = input.nnz; + output->ncapacity_ = input.nnz; +} + +// Nullize pointers in Galatic's sparse matrices; +// Galatic's destructors check for null. Doing this will prevent double +// freeing when shallowcopying with matrixToGalatic & galaticToSparse +template +static void nullizeGalaticMatrix(dCSR& m) { + m.data = nullptr; + m.col_ids = nullptr; + m.row_offsets = nullptr; +} + +// A generic shim between graphblast's and GALATIC's semiring interfaces +template +struct GalaticSemiring : SemiRing { + NativeSR nativeSemiring; + + __device__ c multiply(const a& left, const b& right) const + { return nativeSemiring.mul_op(left, right); } + __device__ c add(const c& left,const c& right) const + { return nativeSemiring.add_op(left, right); } + __device__ static c AdditiveIdentity() + { return NativeSR::identity(); } +}; + +template +Info GALATIC_spgemm(SparseMatrix* C, + SemiringT op, + const SparseMatrix* A, + const SparseMatrix* B, + Descriptor* desc) { + + Index A_nrows, A_ncols, A_nvals; + Index B_nrows, B_ncols, B_nvals; + Index C_nrows, C_ncols, C_nvals; + + A_nrows = A->nrows_; + A_ncols = A->ncols_; + A_nvals = A->nvals_; + B_nrows = B->nrows_; + B_ncols = B->ncols_; + B_nvals = B->nvals_; + C_nrows = C->nrows_; + C_ncols = C->ncols_; + + // Dimension compatibility check + if ((A_ncols != B_nrows) || (C_ncols != B_ncols) || (C_nrows != A_nrows)) { + std::cout << "Dim mismatch mxm" << std::endl; + std::cout << A_ncols << " " << B_nrows << std::endl; + std::cout << C_ncols << " " << B_ncols << std::endl; + std::cout << C_nrows << " " << A_nrows << std::endl; + return GrB_DIMENSION_MISMATCH; + } + + if (C->d_csrColInd_ != NULL) { + CUDA_CALL(cudaFree(C->d_csrColInd_)); + CUDA_CALL(cudaFree(C->d_csrVal_)); + C->d_csrColInd_ = NULL; + C->d_csrVal_ = NULL; + } + + if (C->d_csrRowPtr_ != NULL) { + CUDA_CALL(cudaFree(C->d_csrRowPtr_)); + C->d_csrRowPtr_ = NULL; + } + + if (C->h_csrColInd_ != NULL) { + free(C->h_csrColInd_); + free(C->h_csrVal_); + C->h_csrColInd_ = NULL; + C->h_csrVal_ = NULL; + } + + dCSR outMatrixGPU; + dCSR leftInputMatrixGPU; + dCSR rightInputMatrixGPU; + + //shallow copy input matrices to galatic format + matrixToGalatic(A, leftInputMatrixGPU); + matrixToGalatic(B, rightInputMatrixGPU); + + GPUMatrixMatrixMultiplyTraits DefaultTraits; + + // GALATIC has its own semiring interface; + // GalaticSemiring is a shim here for conversion of graphblast-style + // SemiringT type. GalaticSemiring definition is above this function + GalaticSemiring semiring_shim; + semiring_shim.nativeSemiring = op; + + ExecutionStats stats; + try { + Desc_value nt_mode; + CHECK(desc->get(GrB_NT, &nt_mode)); + const int num_threads = static_cast(nt_mode); + + switch (num_threads) { + case 64: + ACSpGEMM::MultiplyImplementation, + 64, 4, 2, 8, 4, 16, 512, 8, 0, a, b, c, + GalaticSemiring> + (leftInputMatrixGPU, rightInputMatrixGPU, + outMatrixGPU, DefaultTraits, stats, semiring_shim); + break; + case 128: + ACSpGEMM::MultiplyImplementation, + 128, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c, + GalaticSemiring> + ( leftInputMatrixGPU, rightInputMatrixGPU, + outMatrixGPU, DefaultTraits, stats, semiring_shim); + break; + case 512: + ACSpGEMM::MultiplyImplementation, + 512, 1, 1, 1, 2, 16, 512, 8, 0, a, b, c, + GalaticSemiring> + (leftInputMatrixGPU, rightInputMatrixGPU, + outMatrixGPU, DefaultTraits, stats, semiring_shim); + break; + default: // 256 + ACSpGEMM::MultiplyImplementation, + 256, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c, + GalaticSemiring> + (leftInputMatrixGPU, rightInputMatrixGPU, + outMatrixGPU, DefaultTraits, stats, semiring_shim); + break; + } + } catch(std::exception& e) { + std::cerr + << "Exception occured in GALATIC SpGEMM, called from GALATIC_spgemm\n" + << "Exception:\n" + << e.what() + << std::endl; + return GrB_OUT_OF_MEMORY; //the most likely issue, fixme + } + + // shallow copy to native format. + galaticToSparse(C , outMatrixGPU); + + // prevent allocations being freed twice when destructors are ran, + // as we are doing shallow copies: + // + // A, B -> leftInputMatrixGPU, rightInputMatrixGPU + // outMatrixGPU -> C. + nullizeGalaticMatrix(outMatrixGPU); + nullizeGalaticMatrix(leftInputMatrixGPU); + nullizeGalaticMatrix(rightInputMatrixGPU); + + if (C->h_csrRowPtr_ == NULL) + C->h_csrRowPtr_ = reinterpret_cast(malloc((A_nrows+1)* + sizeof(Index))); + C->h_csrColInd_ = reinterpret_cast(malloc(C->ncapacity_*sizeof(Index))); + C->h_csrVal_ = reinterpret_cast(malloc(C->ncapacity_*sizeof(c))); + + C->need_update_ = true; // Set flag that we need to copy data from GPU + C->csr_initialized_ = true; + C->csc_initialized_ = false; + return GrB_SUCCESS; +} template diff --git a/graphblas/backend/cuda/spmspv_inner.hpp b/graphblas/backend/cuda/spmspv_inner.hpp index 38bac9e..2704a6b 100644 --- a/graphblas/backend/cuda/spmspv_inner.hpp +++ b/graphblas/backend/cuda/spmspv_inner.hpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/graphblas/backend/cuda/spmv.hpp b/graphblas/backend/cuda/spmv.hpp index dd61f22..2d2077b 100644 --- a/graphblas/backend/cuda/spmv.hpp +++ b/graphblas/backend/cuda/spmv.hpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/graphblas/stddef.hpp b/graphblas/stddef.hpp index 1a12f21..815caa5 100644 --- a/graphblas/stddef.hpp +++ b/graphblas/stddef.hpp @@ -73,21 +73,20 @@ struct less_equal { return lhs <= rhs; } }; - template -struct first { +struct left_arg { inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) { return lhs; } }; - template -struct second { +struct right_arg { inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) { return rhs; } }; + template struct minimum { inline GRB_HOST_DEVICE T_out operator()(T_in1 lhs, T_in2 rhs) { @@ -173,21 +172,19 @@ REGISTER_MONOID(NotEqualToMonoid, not_equal_to, std::numeric_limits::max( } // namespace graphblas // Semiring generator macro provided by Scott McMillan -#define REGISTER_SEMIRING(SR_NAME, ADD_MONOID, MULT_BINARYOP) \ -template \ -struct SR_NAME \ -{ \ - typedef T_out result_type; \ - typedef T_out T_out_type; \ - \ - inline T_out identity() const \ - { return ADD_MONOID().identity(); } \ - \ - inline __host__ __device__ T_out add_op(T_out lhs, T_out rhs) \ - { return ADD_MONOID()(lhs, rhs); } \ - \ - inline __host__ __device__ T_out mul_op(T_in1 lhs, T_in2 rhs) \ - { return MULT_BINARYOP()(lhs, rhs); } \ +#define REGISTER_SEMIRING(SR_NAME, ADD_MONOID, MULT_BINARYOP) \ +template \ +struct SR_NAME \ +{ \ + typedef T_out result_type; \ + typedef T_out T_out_type; \ + \ + static inline GRB_HOST_DEVICE T_out identity() \ + { return ADD_MONOID().identity(); } \ + inline GRB_HOST_DEVICE T_out add_op(const T_out& lhs, const T_out& rhs) const \ + { return ADD_MONOID()(lhs, rhs); } \ + inline GRB_HOST_DEVICE T_out mul_op(const T_in1& lhs, const T_in2& rhs) const \ + { return MULT_BINARYOP()(lhs, rhs); } \ }; namespace graphblas { diff --git a/graphblas/types.hpp b/graphblas/types.hpp index cf5c6ba..35a8892 100644 --- a/graphblas/types.hpp +++ b/graphblas/types.hpp @@ -60,6 +60,7 @@ enum Desc_value {GrB_SCMP, // for GrB_MASK GrB_DEFAULT, GrB_CUSPARSE, // for SpMV, SpMM GrB_CUSPARSE2, + GrB_GALATIC, GrB_FIXEDROW, GrB_FIXEDCOL, GrB_MERGEPATH = 9, diff --git a/test/gspgemm.cu b/test/gspgemm.cu index 9e08388..c7668a4 100644 --- a/test/gspgemm.cu +++ b/test/gspgemm.cu @@ -18,16 +18,18 @@ int main( int argc, char** argv ) std::vector a_row_indices, b_row_indices; std::vector a_col_indices, b_col_indices; - std::vector a_values, b_values; + std::vector a_values, b_values; graphblas::Index a_num_rows, a_num_cols, a_num_edges; graphblas::Index b_num_rows, b_num_cols, b_num_edges; char* dat_name; - // Load A + + + // Load A std::cout << "loading A" << std::endl; readMtx("../data/small/chesapeake.mtx", &a_row_indices, &a_col_indices, &a_values, &a_num_rows, &a_num_cols, &a_num_edges, 0, false, &dat_name); - graphblas::Matrix a(a_num_rows, a_num_cols); + graphblas::Matrix a(a_num_rows, a_num_cols); a.build(&a_row_indices, &a_col_indices, &a_values, a_num_edges, GrB_NULL, dat_name); if(DEBUG) a.print(); @@ -36,20 +38,26 @@ int main( int argc, char** argv ) std::cout << "loading B" << std::endl; readMtx("../data/small/chesapeake.mtx", &b_row_indices, &b_col_indices, &b_values, &b_num_rows, &b_num_cols, &b_num_edges, 0, false, &dat_name); - graphblas::Matrix b(b_num_rows, b_num_cols); + graphblas::Matrix b(b_num_rows, b_num_cols); b.build(&b_row_indices, &b_col_indices, &b_values, b_num_edges, GrB_NULL, dat_name); if(DEBUG) b.print(); // - graphblas::Matrix c(a_num_rows, b_num_cols); + graphblas::Matrix c(a_num_rows, b_num_cols); graphblas::Descriptor desc; - desc.descriptor_.debug_ = true; - graphblas::mxm( + + po::variables_map vm; + parseArgs(argc, argv, &vm); + CHECK(desc.loadArgs(vm)); + + + desc.descriptor_.debug_ = true; + graphblas::mxm( &c, GrB_NULL, GrB_NULL, - graphblas::PlusMultipliesSemiring(), + graphblas::PlusMultipliesSemiring(), &a, &b, &desc @@ -57,16 +65,17 @@ int main( int argc, char** argv ) if(DEBUG) c.print(); // Multiply using GPU array initialization. - graphblas::Matrix A(a_num_rows, a_num_cols); - graphblas::Matrix B(b_num_rows, b_num_cols); - graphblas::Matrix C(a_num_rows, b_num_cols); + graphblas::Matrix A(a_num_rows, a_num_cols); + graphblas::Matrix B(b_num_rows, b_num_cols); + graphblas::Matrix C(a_num_rows, b_num_cols); A.build(a.matrix_.sparse_.d_csrRowPtr_, a.matrix_.sparse_.d_csrColInd_, a.matrix_.sparse_.d_csrVal_, a.matrix_.sparse_.nvals_); B.build(b.matrix_.sparse_.d_csrRowPtr_, b.matrix_.sparse_.d_csrColInd_, b.matrix_.sparse_.d_csrVal_, b.matrix_.sparse_.nvals_); + desc.descriptor_.debug_ = true; - graphblas::mxm(&C, GrB_NULL, GrB_NULL, graphblas::PlusMultipliesSemiring(), + graphblas::mxm(&C, GrB_NULL, GrB_NULL, graphblas::CustomLessPlusSemiring(), &A, &B, &desc); // Multiply using CPU array initialization.