@@ -1537,8 +1537,6 @@ static void ggml_cuda_op_mul_mat(
 
     // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
     if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
         const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
         const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
         CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
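
For intuition, here is a minimal, self-contained sketch of the padding arithmetic in this hunk. It assumes MATRIX_ROW_PADDING is 512 (its value in the CUDA backend) and uses a hypothetical stand-in for ggml_row_size() hard-coded to Q8_0 (32-element blocks of 34 bytes); the real code handles any quantized type:

#include <stdio.h>

#define MATRIX_ROW_PADDING 512 // value used by the CUDA backend

// Hypothetical stand-in for ggml_row_size(), specialized to Q8_0:
// 32 int8 quants + one fp16 scale = 34 bytes per 32-element block.
static size_t row_size_q8_0(long long n) {
    return (size_t) (n/32) * 34;
}

int main(void) {
    const long long ne00  = 4256; // row length, not a multiple of 512
    const long long nrows = 8;    // rows assigned to this device

    const size_t nbytes_data    = row_size_q8_0(nrows*ne00);
    const size_t nbytes_padding = row_size_q8_0(MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);

    // The cudaMemsetAsync() above clears [nbytes_data, nbytes_data + nbytes_padding)
    // so kernels that read whole padded rows see zeros instead of stale memory.
    printf("data: %zu bytes, padding to clear: %zu bytes\n", nbytes_data, nbytes_padding);
    return 0;
}
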
@@ -2070,11 +2068,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     }
 
     ggml_tensor src0_slice = *src0;
-    src0_slice.ne[2]    = 1;
-    src0_slice.nb[3]    = src0_slice.nb[2];
-    src0_slice.op       = GGML_OP_VIEW;
-    src0_slice.view_src = dst->src[0]; // non-const pointer to src0
-    src0_slice.data     = (char *) src0->data + i02*nb02;
+    src0_slice.ne[2] = 1;
+    src0_slice.nb[3] = src0_slice.nb[2];
+    src0_slice.data  = (char *) src0->data + i02*nb02;
+    GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);
 
     ggml_tensor src1_slice;
     memset(&src1_slice, 0, sizeof(src1_slice));
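
The new lines build the expert slice as a plain shallow copy rather than a VIEW node, and the added assert records the constraint that makes this safe: the MMQ path is only taken when ne00 is already a multiple of MATRIX_ROW_PADDING, i.e. when no row padding would need clearing for the slice. Below is a minimal sketch of the same shape/stride/pointer arithmetic, with a hypothetical cut-down struct standing in for ggml_tensor:

#include <stdint.h>
#include <stdio.h>

// Hypothetical cut-down ggml_tensor, just enough for the slice:
typedef struct {
    int64_t ne[4]; // elements per dimension
    size_t  nb[4]; // byte strides per dimension
    char   *data;
} tensor;

int main(void) {
    // A 4 x 3 x 2 tensor: two contiguous 4x3 "expert" matrices.
    float buf[2*3*4];
    for (int i = 0; i < 2*3*4; i++) {
        buf[i] = (float) i;
    }
    tensor src0 = {
        .ne   = {4, 3, 2, 1},
        .nb   = {sizeof(float), 4*sizeof(float), 12*sizeof(float), 24*sizeof(float)},
        .data = (char *) buf,
    };

    // Slice out expert i02 the same way the patch does: shrink dim 2 to a
    // single matrix, collapse the dim-3 stride, and offset the data pointer.
    const int64_t i02 = 1;
    tensor src0_slice = src0;
    src0_slice.ne[2] = 1;
    src0_slice.nb[3] = src0_slice.nb[2];
    src0_slice.data  = src0.data + i02*src0.nb[2];

    // First element of expert 1 is buf[12] == 12:
    printf("%g\n", *(float *) src0_slice.data);
    return 0;
}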