@@ -7930,8 +7930,12 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
+    if (
+#if !defined(GGML_USE_CUBLAS)
+        ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+#endif
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
         return true;
@@ -8041,15 +8045,16 @@ static void ggml_compute_forward_mul_mat_f32(
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+#if !defined(GGML_USE_CUBLAS)
            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
            const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
+#endif
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
 #if defined(GGML_USE_CUBLAS)
            // copy data to device
-           CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
-           CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
            // compute
            CUBLAS_CHECK(
@@ -8269,13 +8274,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 #endif
 
 #if defined(GGML_USE_CUBLAS)
-           const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
            const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
 
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
            // copy data to device
-           CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
            CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
            // compute
@@ -8539,9 +8543,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
            // copy and dequantize on device
-           CUDA_CHECK(
-               cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
-                   GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
 
            dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
            CUDA_CHECK(cudaGetLastError());
@@ -8561,7 +8563,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
            // copy data to device
-           CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
            // compute
            CUBLAS_CHECK(
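
The hunks above only change the call sites; the body of ggml_cuda_h2d_tensor_2d lives in the CUDA backend and is not part of this diff. As a rough sketch of what such a helper could look like (not the actual implementation: it assumes the helper returns a cudaError_t so CUDA_CHECK can wrap it, and that it uses the public ggml_type_size()/ggml_blck_size() accessors), it would copy one (i03, i02) slice of a tensor to the device, using a flat cudaMemcpyAsync when the slice is fully contiguous and falling back to cudaMemcpy2DAsync for strided rows, which is why the ggml_is_contiguous() checks can be relaxed for cuBLAS in the first hunk:

// Sketch only, under the assumptions stated above.
#include <cuda_runtime.h>
#include "ggml.h"

static cudaError_t ggml_cuda_h2d_tensor_2d(
        void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, cudaStream_t stream) {
    const int64_t ne0 = src->ne[0];
    const int64_t ne1 = src->ne[1];
    const size_t  nb0 = src->nb[0];
    const size_t  nb1 = src->nb[1];
    const size_t  nb2 = src->nb[2];
    const size_t  nb3 = src->nb[3];

    const size_t ts = ggml_type_size(src->type);  // bytes per block
    const size_t bs = ggml_blck_size(src->type);  // elements per block
    const size_t row_size = ts*ne0/bs;             // bytes per packed row

    // start of the [i3][i2] slice on the host
    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;

    if (nb0 == ts && nb1 == row_size) {
        // slice is fully contiguous: one flat copy
        return cudaMemcpyAsync(dst, x, ne1*row_size, cudaMemcpyHostToDevice, stream);
    }
    if (nb0 == ts) {
        // rows are contiguous but strided relative to each other: strided 2D copy
        return cudaMemcpy2DAsync(dst, row_size, x, nb1, row_size, ne1,
                                 cudaMemcpyHostToDevice, stream);
    }
    // element-strided rows (non-quantized types only, bs == 1): slow per-row fallback,
    // each row copied as an ne0-high column of nb0-strided elements
    for (int64_t i1 = 0; i1 < ne1; ++i1) {
        const char * rx = x + i1*nb1;
        char       * rd = (char *) dst + i1*row_size;
        cudaError_t err = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0,
                                            cudaMemcpyHostToDevice, stream);
        if (err != cudaSuccess) {
            return err;
        }
    }
    return cudaSuccess;
}

With the per-slice copies expressed through one helper, the f32, f16 and quantized mul_mat paths share the same host-to-device logic instead of repeating hand-written cudaMemcpyAsync calls with per-type size arithmetic.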