Add NVIDIA cuBLAS support #1044

Merged 3 commits on Apr 19, 2023

Changes from all commits
21 changes: 21 additions & 0 deletions CMakeLists.txt
@@ -66,6 +66,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -142,6 +143,26 @@ if (LLAMA_OPENBLAS)
endif()
endif()

if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")

add_compile_definitions(GGML_USE_CUBLAS)

if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

else()
message(WARNING "cuBLAS not found")
endif()
endif()
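
Note: find_package(CUDAToolkit) is what forces the cmake_minimum_required(VERSION 3.17) bump inside this branch; the FindCUDAToolkit module was only added in CMake 3.17. With this change a cuBLAS build is enabled via cmake -DLLAMA_CUBLAS=ON, or make LLAMA_CUBLAS=1 for the Makefile path below.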

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
4 changes: 4 additions & 0 deletions Makefile
@@ -97,6 +97,10 @@ ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
Collaborator: pthread is added above depending on OS.

Collaborator: Wait, do we actually ever link against pthread? Why is it only a compile flag?

Member Author: From what I understand it is a dependency of CUDA, so it is required to build with cuBLAS.
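
Note: the extra libraries are link-time dependencies of NVIDIA's static archives: cudart_static needs pthread and dl, and cublas_static needs culibos, which is why they are listed explicitly here even though pthread is otherwise only passed as a compile flag.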

endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
206 changes: 194 additions & 12 deletions ggml.c
@@ -142,10 +142,46 @@ inline static void* ggml_aligned_malloc(size_t size) {
} \
} while (0)

#ifdef GGML_USE_ACCELERATE
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#elif GGML_USE_OPENBLAS
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
#elif defined(GGML_USE_CUBLAS)
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
exit(1); \
} \
} while (0)

#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)

static cublasHandle_t cublasH = NULL;
static cudaStream_t cudaStream = NULL;
static void init_cublas(void) {
if (cublasH == NULL) {
// create cublas handle, bind a stream
CUBLAS_CHECK(cublasCreate(&cublasH));

CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream));

// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
}
}
#endif
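
Note: both macros exit the process on the first failing call with a file/line diagnostic, and init_cublas is idempotent: it lazily creates a single global handle bound to one non-blocking stream, so every upload, GEMM and download issued in the mul_mat paths below is serialized on that stream and only the final cudaStreamSynchronize has to wait.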

#undef MIN
@@ -3605,6 +3641,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

// initialize cuBLAS
#if defined(GGML_USE_CUBLAS)
init_cublas();
#endif

is_first_call = false;
}

@@ -7161,7 +7202,7 @@ static void ggml_compute_forward_rms_norm(

// ggml_compute_forward_mul_mat

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
@@ -7201,7 +7242,7 @@ static void ggml_compute_forward_mul_mat_f32(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
@@ -7258,7 +7299,7 @@ static void ggml_compute_forward_mul_mat_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -7272,22 +7313,59 @@
return;
}

#if defined(GGML_USE_CUBLAS)
float *d_X = NULL;
float *d_Y = NULL;
float *d_D = NULL;
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne10;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;

CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
#endif

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

#if defined(GGML_USE_CUBLAS)
// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));

// compute
CUBLAS_CHECK(
cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, ne00,
d_Y, ne10,
&beta, d_D, ne01));

// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
// zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}

#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
#endif
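
Note on the transpose arguments: unlike the CBLAS call in the other branch, cuBLAS has no row-major mode and always assumes column-major storage. A row-major matrix reinterpreted as column-major is its transpose, so passing the buffers unchanged with CUBLAS_OP_T on x and CUBLAS_OP_N on y (and m = ne01, n = ne11, k = ne10) computes x*y^T in column-major, whose memory layout is exactly the row-major d = y*x^T that the CBLAS branch produces. A minimal standalone sketch checking that equivalence (illustrative only, not part of the patch; error checking omitted for brevity):

// rowmajor_check.c: cublasSgemm(OP_T, OP_N, ...) on row-major buffers
// reproduces the row-major d = y * x^T of the CBLAS branch.
#include <stdio.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main(void) {
    const float x[6] = {1, 2, 3,  4, 5, 6}; // 2x3 row-major (plays src0)
    const float y[6] = {1, 0, 1,  0, 1, 0}; // 2x3 row-major (plays src1)
    float d[4];                             // d = y * x^T, 2x2 row-major
    const float alpha = 1.0f, beta = 0.0f;

    float *d_X, *d_Y, *d_D;
    cudaMalloc((void **)&d_X, sizeof(x));
    cudaMalloc((void **)&d_Y, sizeof(y));
    cudaMalloc((void **)&d_D, sizeof(d));
    cudaMemcpy(d_X, x, sizeof(x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Y, y, sizeof(y), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // same argument pattern as above: m = ne01 = 2, n = ne11 = 2, k = ne10 = 3,
    // lda = ne00 = 3, ldb = ne10 = 3, ldc = ne01 = 2
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, 2, 2, 3,
                &alpha, d_X, 3, d_Y, 3, &beta, d_D, 2);

    cudaMemcpy(d, d_D, sizeof(d), cudaMemcpyDeviceToHost);
    printf("%g %g\n%g %g\n", d[0], d[1], d[2], d[3]); // expected: 4 10 / 2 5

    cublasDestroy(handle);
    cudaFree(d_X); cudaFree(d_Y); cudaFree(d_D);
    return 0;
}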
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

return;
@@ -7417,7 +7495,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));

@@ -7433,10 +7511,37 @@
return;
}

float * const wdata = params->wdata;
#if defined(GGML_USE_CUBLAS)
ggml_fp16_t * const wdata = params->wdata;

float *d_X = NULL;
float *d_Y = NULL;
float *d_D = NULL;
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne10;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;

CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(ggml_fp16_t) * x_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
#else
float * const wdata = params->wdata;
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
#if defined(GGML_USE_CUBLAS)
// with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne11; ++i01) {
for (int64_t i00 = 0; i00 < ne10; ++i00) {
wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
}
}
}
#else
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -7445,7 +7550,32 @@
}
}
}
#endif

#if defined(GGML_USE_CUBLAS)
const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;

float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, cudaStream));

// compute
CUBLAS_CHECK(
cublasGemmEx(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, CUDA_R_16F, ne00,
d_Y, CUDA_R_16F, ne10,
&beta, d_D, CUDA_R_32F, ne01,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT));

// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

Expand All @@ -7457,9 +7587,15 @@ static void ggml_compute_forward_mul_mat_f16_f32(
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}

#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
#endif
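
Note: the asymmetry with the CPU path is deliberate (see the comment above): the CBLAS branch converts the fp16 weights src0 to fp32 into wdata, while the cuBLAS branch instead converts the fp32 activations src1 to fp16, so both operands are uploaded as CUDA_R_16F, the (typically much larger) weight matrix crosses the bus at half the size, and cublasGemmEx still accumulates in CUBLAS_COMPUTE_32F.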
/*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/

return;
@@ -7611,7 +7747,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -7628,6 +7764,21 @@
float * const wdata = params->wdata;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;

#if defined(GGML_USE_CUBLAS)
float *d_X = NULL;
float *d_Y = NULL;
float *d_D = NULL;
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne10;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;

CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
#endif

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
{
@@ -7643,15 +7794,38 @@

float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

#if defined(GGML_USE_CUBLAS)
// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));

// compute
CUBLAS_CHECK(
cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, ne00,
d_Y, ne10,
&beta, d_D, ne01));

// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
// zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}

#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
#endif
Contributor: Why not add a CUDA quantize row below as well?

Member Author: It's not used in cuBLAS.

Contributor: Yes, my bad, we do not need to quantize the output tensor nor the weight matrix.
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

return;
Expand Down Expand Up @@ -10466,7 +10640,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
size_t cur = 0;

if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
@@ -10483,7 +10657,7 @@
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
} else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -11800,7 +11974,15 @@ int ggml_cpu_has_wasm_simd(void) {
}

int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
return 1;
#else
return 0;
#endif
}

int ggml_cpu_has_cublas(void) {
#if defined(GGML_USE_CUBLAS)
return 1;
#else
return 0;