
Commit af38d1d

Revert "CUDA: fix bad asserts for partial offload (ggml-org#13337)"
This reverts commit 2356fb1.
1 parent 4f1275d · commit af38d1d

File tree

6 files changed: +6 −21 lines

ggml/include/ggml.h
ggml/src/ggml-cuda/fattn-common.cuh
ggml/src/ggml-cuda/ggml-cuda.cu
ggml/src/ggml-cuda/mmq.cu
ggml/src/ggml-cuda/mmvq.cu
ggml/src/ggml.c


ggml/include/ggml.h

Lines changed: 0 additions & 4 deletions
```diff
@@ -686,15 +686,11 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
     GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
-    // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
-    GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
-
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
 
```

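For reference, the two comments deleted above describe different properties: ggml_is_contiguous requires no gaps and no permutation, while the removed ggml_is_contiguously_allocated only requires that the data occupy one gap-free block. A minimal sketch of the difference (illustrative shape and values; the check is written out inline with the same expression as the removed implementation shown in the ggml.c hunk below, so it builds on either side of this revert):

```c
// Sketch only: a permuted tensor still occupies one gap-free block of memory,
// so it is "contiguously allocated" but not ggml_is_contiguous.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ NULL, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3); // swap the first two axes

    // same check as the removed ggml_is_contiguously_allocated() (see the ggml.c hunk below):
    const int alloc_contig =
        ggml_nbytes(p) == ggml_nelements(p)*ggml_type_size(p->type)/ggml_blck_size(p->type);

    printf("ggml_is_contiguous(p)  = %d\n", ggml_is_contiguous(p)); // 0: strides are permuted
    printf("contiguously allocated = %d\n", alloc_contig);          // 1: still one gap-free block

    ggml_free(ctx);
    return 0;
}
```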
ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 0 additions & 2 deletions
```diff
@@ -719,7 +719,6 @@ void launch_fattn(
     size_t nb23 = V->nb[3];
 
     if (need_f16_K && K->type != GGML_TYPE_F16) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(K));
         K_f16.alloc(ggml_nelements(K));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
         to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -734,7 +733,6 @@ void launch_fattn(
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(V));
         V_f16.alloc(ggml_nelements(V));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
         to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
```

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 4 additions & 7 deletions
```diff
@@ -1537,8 +1537,6 @@ static void ggml_cuda_op_mul_mat(
 
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
-            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-            GGML_ASSERT(!src0->view_src);
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
@@ -2070,11 +2068,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         }
 
         ggml_tensor src0_slice = *src0;
-        src0_slice.ne[2]    = 1;
-        src0_slice.nb[3]    = src0_slice.nb[2];
-        src0_slice.op       = GGML_OP_VIEW;
-        src0_slice.view_src = dst->src[0]; // non-const pointer to src0
-        src0_slice.data     = (char *) src0->data + i02*nb02;
+        src0_slice.ne[2] = 1;
+        src0_slice.nb[3] = src0_slice.nb[2];
+        src0_slice.data  = (char *) src0->data + i02*nb02;
+        GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);
 
         ggml_tensor src1_slice;
         memset(&src1_slice, 0, sizeof(src1_slice));
```

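The comment in the first hunk above refers to padding on temporary compute buffers: with partial offload, quantized src0 rows are placed in a compute buffer whose rows are padded up to a multiple of MATRIX_ROW_PADDING elements, and the padding bytes have to be zeroed before the matrix multiplication. A rough host-side sketch of the sizes involved (the row length, row count, and the MATRIX_ROW_PADDING value are illustrative assumptions):

```c
// Sketch only: the sizes that the cudaMemsetAsync in the hunk above works with.
#include <stdio.h>
#include "ggml.h"

#define MATRIX_ROW_PADDING 512 // assumed value; defined in the CUDA backend

int main(void) {
    const enum ggml_type type  = GGML_TYPE_Q4_0; // a quantized type, as required by the check above
    const int64_t        ne00  = 4000;           // hypothetical row length, not a multiple of the padding
    const int64_t        nrows = 32;             // hypothetical number of rows offloaded to this device

    // bytes of real data, followed by the padding bytes that must be zeroed:
    const size_t nbytes_data    = ggml_row_size(type, nrows*ne00);
    const size_t nbytes_padding = ggml_row_size(type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);

    printf("data    : %zu bytes\n", nbytes_data);
    printf("padding : %zu bytes cleared after the data\n", nbytes_padding);
    return 0;
}
```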
ggml/src/ggml-cuda/mmq.cu

Lines changed: 1 addition & 2 deletions
```diff
@@ -91,8 +91,7 @@ void ggml_cuda_mul_mat_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
+        GGML_ASSERT(ggml_is_contiguous(src0));
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
```

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 1 addition & 2 deletions
```diff
@@ -515,8 +515,7 @@ void ggml_cuda_mul_mat_vec_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
+        GGML_ASSERT(ggml_is_contiguous(src0));
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
```

ggml/src/ggml.c

Lines changed: 0 additions & 4 deletions
```diff
@@ -1312,10 +1312,6 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
     return ggml_is_contiguous_n(tensor, 2);
 }
 
-bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
-}
-
 bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
```

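The removed helper compared ggml_nbytes() against the size implied by the element count, which for block-quantized types involves the block size. A small sketch of that expression (arbitrary element count, not part of the commit): for GGML_TYPE_F32 the block size is 1, so it is simply 4 bytes per element; for block-quantized types such as Q4_0 it is the per-block byte size divided by the number of elements per block.

```c
// Sketch only: the byte count the removed function compared ggml_nbytes() against.
#include <stdio.h>
#include "ggml.h"

static size_t packed_size(enum ggml_type type, int64_t nelements) {
    // same expression as the removed ggml_is_contiguously_allocated():
    return nelements*ggml_type_size(type)/ggml_blck_size(type);
}

int main(void) {
    const int64_t n = 64*4; // arbitrary example element count
    printf("F32 : %zu bytes\n", packed_size(GGML_TYPE_F32,  n)); // block size 1: 4 bytes per element
    printf("Q4_0: %zu bytes\n", packed_size(GGML_TYPE_Q4_0, n)); // per-block bytes / elements per block
    return 0;
}
```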