Fix ggml_tensor_extra_gpu memory leak #2146

Closed
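
In outline, the change stops heap-allocating the per-tensor GPU metadata with `new ggml_tensor_extra_gpu` (which could leak whenever no matching `delete` ran for a tensor) and instead reserves storage for `ggml_tensor_extra_gpu` inside `ggml_tensor` itself, so the metadata lives and dies with the tensor. A minimal sketch of the before/after allocation pattern, using simplified stand-in names rather than the real llama.cpp API:

```cpp
// Sketch of the two lifetimes; extra_gpu, tensor_old, tensor_new and assign_*
// are illustrative stand-ins, not the real llama.cpp identifiers.
#include <cstring>

struct extra_gpu {
    void * data_device[16];   // stands in for ggml_tensor_extra_gpu
};

struct tensor_old { void * extra; };                   // old layout: opaque heap pointer
struct tensor_new { char extra[sizeof(extra_gpu)]; };  // new layout: storage embedded in the tensor

void assign_old(tensor_old * t) {
    t->extra = new extra_gpu();   // heap allocation; leaks unless every tensor is explicitly freed
}

void assign_new(tensor_new * t) {
    extra_gpu * e = (extra_gpu *) t->extra;   // no allocation: reuse the embedded storage
    std::memset(e, 0, sizeof(*e));
}
```
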
ggml-cuda.cu: 40 changes (15 additions, 25 deletions)
@@ -233,10 +233,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
-};

 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -2193,7 +2189,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
         src_ptr = (char *) src->data;
     } else if (src->backend == GGML_BACKEND_GPU) {
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        const struct ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -2631,9 +2627,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);

-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (const ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -2964,13 +2960,13 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
@@ -2993,13 +2989,13 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     const int row_stride_x = nb01 / sizeof(half);
@@ -3063,8 +3059,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;

     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -3107,7 +3103,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     memset(extra, 0, sizeof(*extra));

     for (int id = 0; id < g_device_count; ++id) {
Expand Down Expand Up @@ -3148,16 +3144,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
}
}

tensor->extra = extra;
}

void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}

ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
const ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) tensor->extra;

for (int id = 0; id < g_device_count; ++id) {
if (extra->data_device[id] != nullptr) {
@@ -3170,8 +3164,6 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaEventDestroy(extra->events[id]));
         }
     }
-
-    delete extra;
 }

 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
@@ -3191,7 +3183,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }

     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu * ) tensor->extra;
     memset(extra, 0, sizeof(*extra));

     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
@@ -3234,8 +3226,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         CUDA_CHECK(cudaMemset(data, 0, size));
         extra->data_device[g_main_device] = data;
     }
-
-    tensor->extra = extra;
 }

 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
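
With the struct no longer owned by the CUDA backend, the free path above only releases per-device buffers and events; there is no `delete` because nothing was `new`ed. A sketch of the remaining cleanup shape (hypothetical helper name; the real code wraps each call in CUDA_CHECK and iterates g_device_count):

```cpp
#include <cuda_runtime.h>
#include "ggml-cuda.h"  // after this PR, defines ggml_tensor_extra_gpu

// free_tensor_extra is a hypothetical name; it mirrors what ggml_cuda_free_data
// still has to do after this change: release device memory and events, but not the struct.
static void free_tensor_extra(struct ggml_tensor_extra_gpu * extra, int device_count) {
    for (int id = 0; id < device_count; ++id) {
        if (extra->data_device[id] != nullptr) {
            cudaSetDevice(id);
            cudaFree(extra->data_device[id]);
        }
        if (extra->events[id] != nullptr) {
            cudaEventDestroy(extra->events[id]);
        }
    }
}
```
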
ggml-cuda.h: 11 changes (10 additions, 1 deletion)
@@ -1,12 +1,21 @@
 #pragma once

+#define GGML_CUDA_MAX_DEVICES 16
+
+#include <cuda_runtime.h>
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 #include "ggml.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

-#define GGML_CUDA_MAX_DEVICES 16

+
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
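
Because the struct definition (together with GGML_CUDA_MAX_DEVICES and the cuda_runtime.h include) now sits above the `#include "ggml.h"` line in this header, ggml.h can size the embedded field, and any translation unit that includes ggml-cuda.h can read the per-device pointers directly. A hypothetical accessor, only to show the cast (assumes a GGML_USE_CUBLAS build):

```cpp
#include "ggml-cuda.h"
#include "ggml.h"

// Hypothetical helper, not part of the PR: fetch the device buffer a GPU tensor
// records for a given device id in its embedded ggml_tensor_extra_gpu.
static void * tensor_device_ptr(const struct ggml_tensor * t, int device) {
    const struct ggml_tensor_extra_gpu * extra =
        (const struct ggml_tensor_extra_gpu *) t->extra;
    return extra->data_device[device];
}
```
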
ggml.c: 2 changes (0 additions, 2 deletions)
@@ -4588,8 +4588,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
-        /*.extra        =*/ NULL,
-        /*.padding      =*/ { 0 },
     };

     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
ggml.h: 10 changes (7 additions, 3 deletions)
@@ -235,6 +235,10 @@
     const type prefix##3 = (pointer)->array[3]; \
     GGML_UNUSED(prefix##3);

+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -427,9 +431,9 @@ extern "C" {

         char name[GGML_MAX_NAME];

-        void * extra; // extra things e.g. for ggml-cuda.cu
-
-        char padding[8];
+#ifdef GGML_USE_CUBLAS
+        char extra[sizeof(struct ggml_tensor_extra_gpu)]; // extra things e.g. for ggml-cuda.cu
+#endif
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
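
A side effect of the ggml.h change worth keeping in mind: `struct ggml_tensor` (and therefore `GGML_TENSOR_SIZE`, which ggml uses when carving tensors out of a context) now differs in size between GGML_USE_CUBLAS builds and CPU-only builds, since the fixed `void * extra` / `char padding[8]` pair is replaced by a CUBLAS-only array. A trivial check, as a sketch:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // Grows roughly by sizeof(struct ggml_tensor_extra_gpu) when GGML_USE_CUBLAS is defined.
    std::printf("sizeof(struct ggml_tensor) = %zu\n", sizeof(struct ggml_tensor));
    return 0;
}
```
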