@@ -6254,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
-    GGML_ASSERT(src0_dd_i != nullptr);
+    GGML_ASSERT(src0_dd_i  != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i != nullptr);
-
+    GGML_ASSERT(dst_dd_i   != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
+
     const int64_t row_diff = row_high - row_low;
 
     int id;
@@ -7223,12 +7222,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
     if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ
+        // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV
+        // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
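
Note on the new third branch: when src1->ne[2]*src1->ne[3] > 1 (i.e. the KQ/KQV mat-muls carry more than one batch/head slice), dispatch now goes to ggml_cuda_mul_mat_mat_batched_cublas, which issues all per-slice GEMMs in a single cuBLAS call instead of looping over them. Below is a minimal standalone sketch of the strided-batched GEMM primitive such a path builds on; the toy sizes, plain F32 buffers, and unchecked error handling are assumptions for brevity, while the real ggml path operates on the F16/F32 tensors named in the condition above and manages its own device buffers and streams.

    // Minimal strided-batched GEMM sketch (CUDA 11+, compile with: nvcc -lcublas).
    // Computes C_i = A_i * B_i for `batch` independent slices in one call.
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    int main() {
        const int m = 4, n = 4, k = 4, batch = 3;                  // toy sizes (assumption)
        const long long sA = 1LL*m*k, sB = 1LL*k*n, sC = 1LL*m*n;  // element stride between slices

        // Host data: every A element is 1, every B element is 2,
        // so each C element should come out as 2*k.
        std::vector<float> hA(sA*batch, 1.0f), hB(sB*batch, 2.0f), hC(sC*batch, 0.0f);

        float *dA, *dB, *dC;
        cudaMalloc(&dA, sizeof(float)*sA*batch);
        cudaMalloc(&dB, sizeof(float)*sB*batch);
        cudaMalloc(&dC, sizeof(float)*sC*batch);
        cudaMemcpy(dA, hA.data(), sizeof(float)*sA*batch, cudaMemcpyHostToDevice);
        cudaMemcpy(dB, hB.data(), sizeof(float)*sB*batch, cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        const float alpha = 1.0f, beta = 0.0f;
        // One launch covers all `batch` GEMMs; cuBLAS is column-major,
        // so lda/ldb/ldc are the leading (row-count) dimensions.
        cublasGemmStridedBatchedEx(handle,
            CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
            dA, CUDA_R_32F, m, sA,
            dB, CUDA_R_32F, k, sB, &beta,
            dC, CUDA_R_32F, m, sC,
            batch, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

        cudaMemcpy(hC.data(), dC, sizeof(float)*sC*batch, cudaMemcpyDeviceToHost);
        printf("C[0] of slice 0 = %.1f (expected %.1f)\n", hC[0], 2.0f*k);

        cublasDestroy(handle);
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
        return 0;
    }

A single strided-batched launch amortizes kernel-launch and scheduling overhead across all ne[2]*ne[3] slices, which is the point of routing the multi-batch case to the batched cuBLAS helper rather than the per-slice paths above it.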