#define CC_OFFSET_AMD 1000000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+ // - 7B quantum model: +100-200 MB
+ // - 13B quantum model: +200-400 MB
+ //
+ // #define GGML_CUDA_FORCE_MMQ
+
+ // TODO: improve this to be correct for more hardware
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+ // probably other such cases, and not sure what happens on AMD hardware
+ #if !defined(GGML_CUDA_FORCE_MMQ)
+ #define CUDA_USE_TENSOR_CORES
+ #endif
+
+ // max batch size to use MMQ kernels when tensor cores are available
+ #define MMQ_MAX_BATCH_SIZE 32
+
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

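The block added above amounts to a small compile-time policy: unless GGML_CUDA_FORCE_MMQ is defined, CUDA_USE_TENSOR_CORES is enabled and the MMQ kernels are only preferred up to MMQ_MAX_BATCH_SIZE columns, with cuBLAS handling anything larger. A minimal sketch of that intent, using a hypothetical helper name (prefer_mmq_for_batch is not part of the patch, and the real dispatch further down also checks the compute capability and tensor types):

    // Illustrative sketch only -- condenses the intent of the macros above,
    // assuming the macro definitions from ggml-cuda.cu are in scope.
    static bool prefer_mmq_for_batch(int64_t batch_size) {
    #if defined(GGML_CUDA_FORCE_MMQ)
        (void) batch_size;  // MMQ is always used, cuBLAS is never chosen
        return true;
    #else
        // tensor cores assumed available: MMQ only for small batches
        return batch_size <= MMQ_MAX_BATCH_SIZE;
    #endif
    }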
@@ -470,7 +488,6 @@ static int g_device_count = -1;
static int g_main_device = 0;
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = true;

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
@@ -3554,9 +3571,15 @@ static __device__ __forceinline__ void mul_mat_q(
#define MMQ_X_Q4_0_RDNA1 64
#define MMQ_Y_Q4_0_RDNA1 64
#define NWARPS_Q4_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_0_AMPERE 4
+ #define MMQ_Y_Q4_0_AMPERE 32
+ #define NWARPS_Q4_0_AMPERE 4
+ #else
#define MMQ_X_Q4_0_AMPERE 64
#define MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
+ #endif
#define MMQ_X_Q4_0_PASCAL 64
#define MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8
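The same pattern repeats for every quantization type that follows: with CUDA_USE_TENSOR_CORES defined, the Ampere MMQ tile shrinks (here from 64x128 to 4x32), since MMQ is now only used for batches of at most MMQ_MAX_BATCH_SIZE columns and a wide batch tile would be mostly padding. A rough back-of-the-envelope sketch of the effect on the launch grid, under the assumption that mmq_x tiles the src1 columns (the batch) and mmq_y tiles the src0 rows, as in the existing mul_mat_q launch code:

    // Illustrative arithmetic only, not code from the patch:
    // 4096 weight rows, batch capped at MMQ_MAX_BATCH_SIZE = 32 columns.
    const int nrows_x = 4096, ncols_y = 32;
    const int blocks_old = ((ncols_y + 64 - 1) / 64) * ((nrows_x + 128 - 1) / 128); // 1 *  32 =   32 blocks
    const int blocks_new = ((ncols_y +  4 - 1) /  4) * ((nrows_x +  32 - 1) /  32); // 8 * 128 = 1024 blocks
    // the smaller tile wastes far less work on padding and keeps more SMs busy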
@@ -3615,9 +3638,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q4_1_RDNA1 64
#define MMQ_Y_Q4_1_RDNA1 64
#define NWARPS_Q4_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_1_AMPERE 4
+ #define MMQ_Y_Q4_1_AMPERE 32
+ #define NWARPS_Q4_1_AMPERE 4
+ #else
#define MMQ_X_Q4_1_AMPERE 64
#define MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
+ #endif
#define MMQ_X_Q4_1_PASCAL 64
#define MMQ_Y_Q4_1_PASCAL 64
#define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3707,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_0_RDNA1 64
#define MMQ_Y_Q5_0_RDNA1 64
#define NWARPS_Q5_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_0_AMPERE 4
+ #define MMQ_Y_Q5_0_AMPERE 32
+ #define NWARPS_Q5_0_AMPERE 4
+ #else
#define MMQ_X_Q5_0_AMPERE 128
#define MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
+ #endif
#define MMQ_X_Q5_0_PASCAL 64
#define MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3774,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_1_RDNA1 64
#define MMQ_Y_Q5_1_RDNA1 64
#define NWARPS_Q5_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_1_AMPERE 4
+ #define MMQ_Y_Q5_1_AMPERE 32
+ #define NWARPS_Q5_1_AMPERE 4
+ #else
#define MMQ_X_Q5_1_AMPERE 128
#define MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
+ #endif
#define MMQ_X_Q5_1_PASCAL 64
#define MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3841,15 @@ mul_mat_q5_1(
#define MMQ_X_Q8_0_RDNA1 64
#define MMQ_Y_Q8_0_RDNA1 64
#define NWARPS_Q8_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q8_0_AMPERE 4
+ #define MMQ_Y_Q8_0_AMPERE 32
+ #define NWARPS_Q8_0_AMPERE 4
+ #else
#define MMQ_X_Q8_0_AMPERE 128
#define MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
+ #endif
#define MMQ_X_Q8_0_PASCAL 64
#define MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3908,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q2_K_RDNA1 128
#define MMQ_Y_Q2_K_RDNA1 32
#define NWARPS_Q2_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q2_K_AMPERE 4
+ #define MMQ_Y_Q2_K_AMPERE 32
+ #define NWARPS_Q2_K_AMPERE 4
+ #else
#define MMQ_X_Q2_K_AMPERE 64
#define MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
+ #endif
#define MMQ_X_Q2_K_PASCAL 64
#define MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3975,15 @@ mul_mat_q2_K(
#define MMQ_X_Q3_K_RDNA1 32
#define MMQ_Y_Q3_K_RDNA1 128
#define NWARPS_Q3_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q3_K_AMPERE 4
+ #define MMQ_Y_Q3_K_AMPERE 32
+ #define NWARPS_Q3_K_AMPERE 4
+ #else
#define MMQ_X_Q3_K_AMPERE 128
#define MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
+ #endif
#define MMQ_X_Q3_K_PASCAL 64
#define MMQ_Y_Q3_K_PASCAL 64
#define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4044,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q4_K_RDNA1 32
#define MMQ_Y_Q4_K_RDNA1 64
#define NWARPS_Q4_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_K_AMPERE 4
+ #define MMQ_Y_Q4_K_AMPERE 32
+ #define NWARPS_Q4_K_AMPERE 4
+ #else
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
+ #endif
#define MMQ_X_Q4_K_PASCAL 64
#define MMQ_Y_Q4_K_PASCAL 64
#define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4113,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_K_RDNA1 32
#define MMQ_Y_Q5_K_RDNA1 64
#define NWARPS_Q5_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_K_AMPERE 4
+ #define MMQ_Y_Q5_K_AMPERE 32
+ #define NWARPS_Q5_K_AMPERE 4
+ #else
#define MMQ_X_Q5_K_AMPERE 64
#define MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
+ #endif
#define MMQ_X_Q5_K_PASCAL 64
#define MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4180,15 @@ mul_mat_q5_K(
#define MMQ_X_Q6_K_RDNA1 32
#define MMQ_Y_Q6_K_RDNA1 64
#define NWARPS_Q6_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q6_K_AMPERE 4
+ #define MMQ_Y_Q6_K_AMPERE 32
+ #define NWARPS_Q6_K_AMPERE 4
+ #else
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
+ #endif
#define MMQ_X_Q6_K_PASCAL 64
#define MMQ_Y_Q6_K_PASCAL 64
#define NWARPS_Q6_K_PASCAL 8
@@ -5663,6 +5740,16 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
+ #if defined(GGML_CUDA_FORCE_MMQ)
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ #endif
+ #if defined(CUDA_USE_TENSOR_CORES)
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ #endif
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
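With the default build (GGML_CUDA_FORCE_MMQ not defined), the extra startup output added in this hunk would look roughly like the following; exact spacing and the device list depend on the build and hardware:

    ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
    ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes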
@@ -6347,7 +6434,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
+ src1_ddf_i, ne10,
&beta, dst_dd_i, ldc));

if (src0_as != 0) {
@@ -7048,9 +7135,10 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}

- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7202,17 +7290,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
}

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+ const bool all_on_device =
+ (src0->backend == GGML_BACKEND_GPU) &&
+ (src1->backend == GGML_BACKEND_GPU) &&
+ ( dst->backend == GGML_BACKEND_GPU);

int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
min_compute_capability = g_compute_capabilities[id];
}
}

+ #ifdef CUDA_USE_TENSOR_CORES
+ const bool use_tensor_cores = true;
+ #else
+ const bool use_tensor_cores = false;
+ #endif
+
// debug helpers
// printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
// printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7316,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
// printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
// printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+ } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
#ifdef GGML_CUDA_FORCE_DMMV
const bool use_mul_mat_vec_q = false;
#else
@@ -7247,7 +7341,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
}
} else {
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+ // when tensor cores are available, use them for large batch size
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+ use_mul_mat_q = false;
+ }
+
+ if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
} else {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
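The dispatch change in this hunk boils down to a single predicate; the helper below is only a restatement of the code above for clarity (an illustrative function, not something the patch adds, and it assumes the types and constants of ggml-cuda.cu):

    // Restatement of the new MMQ-vs-cuBLAS decision: quantized weights go
    // through MMQ unless tensor cores are available and the batch is large,
    // in which case dequantize + cuBLAS F16 GEMM is used instead.
    static bool should_use_mul_mat_q(const ggml_tensor * src0, const ggml_tensor * src1,
                                     int64_t min_compute_capability, bool use_tensor_cores) {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }
        return use_mul_mat_q;
    }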
@@ -7601,10 +7703,6 @@ void ggml_cuda_set_main_device(const int main_device) {
}
}

- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
- g_mul_mat_q = mul_mat_q;
- }
-
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
// it still won't always work as expected, but it's better than nothing