Add NVIDIA cuBLAS support #1044
Changes to ggml.c:
@@ -142,10 +142,46 @@ inline static void* ggml_aligned_malloc(size_t size) {
         } \
     } while (0)

-#ifdef GGML_USE_ACCELERATE
+#if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif GGML_USE_OPENBLAS
+#elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
+#elif defined(GGML_USE_CUBLAS)
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#define CUDA_CHECK(err)                                                      \
+    do {                                                                     \
+        cudaError_t err_ = (err);                                            \
+        if (err_ != cudaSuccess) {                                           \
+            printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                cudaGetErrorString(err_));                                   \
+            exit(1);                                                         \
+        }                                                                    \
+    } while (0)
+
+#define CUBLAS_CHECK(err)                                                    \
+    do {                                                                     \
+        cublasStatus_t err_ = (err);                                         \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                 \
+            printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
+            exit(1);                                                         \
+        }                                                                    \
+    } while (0)
+
+static cublasHandle_t cublasH = NULL;
+static cudaStream_t cudaStream = NULL;
+static void init_cublas(void) {
+    if (cublasH == NULL) {
+        // create cublas handle, bind a stream
+        CUBLAS_CHECK(cublasCreate(&cublasH));
+
+        CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
+        CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream));
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+    }
+}
 #endif

 #undef MIN

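For context on how the pieces above fit together, here is a minimal standalone sketch (not part of the diff) that exercises the same checked-call macros and the handle/stream setup from init_cublas(); the vector scale at the end is purely illustrative:

```c
// Standalone sketch: CUDA_CHECK/CUBLAS_CHECK plus one-time handle setup,
// mirroring the patch. Build e.g. with: nvcc demo.cu -lcublas (hypothetical name)
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(err)                                                      \
    do {                                                                     \
        cudaError_t err_ = (err);                                            \
        if (err_ != cudaSuccess) {                                           \
            printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                   \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

#define CUBLAS_CHECK(err)                                                    \
    do {                                                                     \
        cublasStatus_t err_ = (err);                                         \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                 \
            printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

int main(void) {
    cublasHandle_t handle = NULL;
    cudaStream_t stream = NULL;

    // create the handle once and bind a non-blocking stream,
    // exactly as init_cublas() does in the patch
    CUBLAS_CHECK(cublasCreate(&handle));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(handle, stream));

    // illustrative use of the handle: scale a small vector on the device
    float h_x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float alpha = 2.0f;
    float *d_x = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_x, sizeof(h_x)));
    CUDA_CHECK(cudaMemcpyAsync(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice, stream));
    CUBLAS_CHECK(cublasSscal(handle, 4, &alpha, d_x, 1));
    CUDA_CHECK(cudaMemcpyAsync(h_x, d_x, sizeof(h_x), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("%f %f %f %f\n", h_x[0], h_x[1], h_x[2], h_x[3]); // prints 2 4 6 8

    CUDA_CHECK(cudaFree(d_x));
    CUBLAS_CHECK(cublasDestroy(handle));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}
```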
@@ -3605,6 +3641,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }

+        // initialize cuBLAS
+        #if defined(GGML_USE_CUBLAS)
+        init_cublas();
+        #endif
+
         is_first_call = false;
     }

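Since this block sits inside the is_first_call guard visible above, the cuBLAS handle and stream are created once per process, on the first ggml_init() call; later contexts reuse them.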
@@ -7161,7 +7202,7 @@ static void ggml_compute_forward_rms_norm(

 // ggml_compute_forward_mul_mat

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -7201,7 +7242,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -7258,7 +7299,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -7272,22 +7313,59 @@ static void ggml_compute_forward_mul_mat_f32(
             return;
         }

+#if defined(GGML_USE_CUBLAS)
+        float *d_X = NULL;
+        float *d_Y = NULL;
+        float *d_D = NULL;
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+        const int x_ne = ne01 * ne10;
+        const int y_ne = ne11 * ne10;
+        const int d_ne = ne11 * ne01;
+
+        CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
+#endif
+
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

+#if defined(GGML_USE_CUBLAS)
+                // copy data to device
+                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
+                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+                // compute
+                CUBLAS_CHECK(
+                    cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, d_X, ne00,
+                                    d_Y, ne10,
+                            &beta,  d_D, ne01));
+
+                // copy data to host
+                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+#else
                 // zT = y * xT
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
+#endif
             }
         }

+#if defined(GGML_USE_CUBLAS)
+        CUDA_CHECK(cudaFree(d_X));
+        CUDA_CHECK(cudaFree(d_Y));
+        CUDA_CHECK(cudaFree(d_D));
+#endif
        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

        return;

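A note on why the cublasSgemm call is configured this way: cuBLAS assumes column-major storage while ggml's buffers are row-major, and a row-major M x N buffer reinterpreted as column-major is exactly the N x M transpose. Read column-major with lda = ne00, d_X is x^T, so CUBLAS_OP_T turns it back into x (ne01 x ne10); d_Y read column-major with ldb = ne10 is y^T (ne10 x ne11). The product x * y^T is written as an ne01 x ne11 column-major matrix with ldc = ne01, which occupies memory identically to the row-major ne11 x ne01 result d = y * x^T, i.e. the same "zT = y * xT" that the cblas_sgemm branch computes with CblasRowMajor. Both branches therefore write the same bytes into dst.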
@@ -7417,7 +7495,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));

@@ -7433,10 +7511,37 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             return;
         }

-        float * const wdata = params->wdata;
+#if defined(GGML_USE_CUBLAS)
+        ggml_fp16_t * const wdata = params->wdata;
+
+        float *d_X = NULL;
+        float *d_Y = NULL;
+        float *d_D = NULL;
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+        const int x_ne = ne01 * ne10;
+        const int y_ne = ne11 * ne10;
+        const int d_ne = ne11 * ne01;
+
+        CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(ggml_fp16_t) * x_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
+#else
+        float * const wdata = params->wdata;
+#endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
+#if defined(GGML_USE_CUBLAS)
+                // with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
+                {
+                    size_t id = 0;
+                    for (int64_t i01 = 0; i01 < ne11; ++i01) {
+                        for (int64_t i00 = 0; i00 < ne10; ++i00) {
+                            wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
+                        }
+                    }
+                }
+#else
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -7445,7 +7550,32 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                         }
                     }
                 }
+#endif

+#if defined(GGML_USE_CUBLAS)
+                const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
+                const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
+
+                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+                // copy data to device
+                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, cudaStream));
+                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+                // compute
+                CUBLAS_CHECK(
+                    cublasGemmEx(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, d_X, CUDA_R_16F, ne00,
+                                    d_Y, CUDA_R_16F, ne10,
+                            &beta,  d_D, CUDA_R_32F, ne01,
+                            CUBLAS_COMPUTE_32F,
+                            CUBLAS_GEMM_DEFAULT));
+
+                // copy data to host
+                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+#else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

@@ -7457,9 +7587,15 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
+#endif
             }
         }

+#if defined(GGML_USE_CUBLAS)
+        CUDA_CHECK(cudaFree(d_X));
+        CUDA_CHECK(cudaFree(d_Y));
+        CUDA_CHECK(cudaFree(d_D));
+#endif
        /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/

        return;

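The cublasGemmEx configuration above is the mixed-precision core of this path: both inputs are read as fp16 (CUDA_R_16F), products are accumulated in fp32 (CUBLAS_COMPUTE_32F), and the output is written as fp32 (CUDA_R_32F). As a sketch, the same call pulled out as a helper, assuming a valid handle created as in init_cublas() (cublasComputeType_t and CUBLAS_COMPUTE_32F require CUDA 11 or newer):

```c
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Hypothetical helper, not part of the patch: fp16 x fp16 -> fp32 GEMM in
// the same layout as the diff (row-major operands, transpose trick).
static cublasStatus_t gemm_f16_to_f32(cublasHandle_t handle,
                                      int ne01, int ne11, int ne10,
                                      const __half *d_X, // src0 weights, ne01 x ne10 row-major
                                      const __half *d_Y, // src1 converted to fp16, ne11 x ne10 row-major
                                      float *d_D) {      // result, ne11 x ne01 row-major
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // the patch passes ne00 as lda, which equals ne10 for a valid matmul
    return cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                        ne01, ne11, ne10,
                        &alpha, d_X, CUDA_R_16F, ne10,
                                d_Y, CUDA_R_16F, ne10,
                        &beta,  d_D, CUDA_R_32F, ne01,
                        CUBLAS_COMPUTE_32F,
                        CUBLAS_GEMM_DEFAULT);
}
```

Converting src1 to fp16 on the host, rather than src0 to fp32 as the CPU path does, also halves the bytes sent over PCIe for both operands: the weights travel in their native fp16, and the activations go up as fp16 instead of fp32.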
@@ -7611,7 +7747,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -7628,6 +7764,21 @@ static void ggml_compute_forward_mul_mat_q_f32(
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;

+#if defined(GGML_USE_CUBLAS)
+        float *d_X = NULL;
+        float *d_Y = NULL;
+        float *d_D = NULL;
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+        const int x_ne = ne01 * ne10;
+        const int y_ne = ne11 * ne10;
+        const int d_ne = ne11 * ne01;
+
+        CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
+        CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
+#endif
+
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 {
@@ -7643,15 +7794,38 @@

                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

+#if defined(GGML_USE_CUBLAS)
+                // copy data to device
+                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
+                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+                // compute
+                CUBLAS_CHECK(
+                    cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, d_X, ne00,
+                                    d_Y, ne10,
+                            &beta,  d_D, ne01));
+
+                // copy data to host
+                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+                CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+#else
                 // zT = y * xT
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
+#endif
             }
         }

+#if defined(GGML_USE_CUBLAS)
+        CUDA_CHECK(cudaFree(d_X));
+        CUDA_CHECK(cudaFree(d_Y));
+        CUDA_CHECK(cudaFree(d_D));
+#endif

Review thread on this hunk:
Comment: Why not add cuda quantize row below as well?
Reply: It's not used in cuBLAS.
Reply: Yes, my bad; we do not need to quantize the output tensor nor the weight matrix.
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); | ||
|
||
return; | ||
|
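In the quantized path only src0 needs preprocessing: each quantized row is expanded to fp32 on the host, after which the flow is identical to the f32 case. A sketch of that data flow for one (i03, i02) slice, reconstructed from the unchanged context the diff view elides (buffer names follow the diff; dequantize_row_q is ggml's per-type dequantizer, whose exact signature should be checked against the source):

```c
// host-side: expand the quantized weight rows of src0 into fp32 scratch
{
    size_t id = 0;
    for (int64_t i01 = 0; i01 < ne01; ++i01) {
        dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01,
                         wdata + id, ne00);
        id += ne00;
    }
}
// wdata is now a plain ne01 x ne00 fp32 matrix; the patch uploads it and
// src1, runs the same cublasSgemm as the f32 path, and copies the fp32
// result back -- which is why no CUDA-side quantize/dequantize kernels
// are needed (see the review exchange above).
```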
@@ -10466,7 +10640,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         size_t cur = 0;

                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -10483,7 +10657,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
                         } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -11800,7 +11974,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
     return 1;
 #else
     return 0;
 #endif
 }

+int ggml_cpu_has_cublas(void) {
+#if defined(GGML_USE_CUBLAS)
+    return 1;
+#else
+    return 0;
+#endif
+}
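Callers can use the new predicate to report backend support at runtime. A hypothetical example (llama.cpp prints a similar system-info line, but this exact snippet is not part of the diff):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // report which BLAS backends this build of ggml was compiled with
    printf("BLAS = %d | cuBLAS = %d\n", ggml_cpu_has_blas(), ggml_cpu_has_cublas());
    return 0;
}
```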
Review thread (on the build changes):
Comment: pthread is added above, depending on the OS.
Comment: Wait, do we actually ever link against pthread? Why is it only a compile flag?
Reply: From what I understand, it is a dependency of CUDA, so it is required to build with cuBLAS.