
Commit 9d0693b

kiltyj and ggerganov authored
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU
* Page-align buffers used by Metal
* Remove trailing whitespace
* Only import unistd.h for Metal builds
* metal : remove unnecessary copies

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent efe0507 commit 9d0693b
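The commit hinges on two facts: newBufferWithBytesNoCopy lets the CPU and GPU share a single allocation instead of duplicating it, and Metal only accepts host memory for this if the pointer is page-aligned (and the length, at least on macOS, is a multiple of the page size). The rounding that ggml_metal_add_buffer now performs inline can be sketched as a standalone C helper (the name align_to_page is illustrative, not from the commit):

// Sketch: round a buffer size up to the next multiple of the system page
// size, as ggml_metal_add_buffer now does before calling
// newBufferWithBytesNoCopy.
#include <stddef.h>
#include <unistd.h>

static size_t align_to_page(size_t size) {
    const size_t page_size = (size_t) getpagesize();
    const size_t rem = size % page_size;
    return rem == 0 ? size : size + (page_size - rem);
}

Once both conditions hold, the GPU reads the same memory the CPU writes, which is why the llama.cpp hunks below simply delete the ggml_metal_set_tensor upload calls ("remove unnecessary copies").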

File tree

4 files changed: +38 -16 lines


ggml-metal.m (+14 -3)
@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
            }
        }

+        size_t page_size = getpagesize();
+        size_t aligned_size = size;
+        if ((aligned_size % page_size) != 0) {
+            aligned_size += (page_size - (aligned_size % page_size));
+        }
+
         ctx->buffers[ctx->n_buffers].name = name;
         ctx->buffers[ctx->n_buffers].data = data;
         ctx->buffers[ctx->n_buffers].size = size;
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];

-        ++ctx->n_buffers;
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+            return false;
+        } else {
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        }

-        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
+        ++ctx->n_buffers;
    }

    return true;

ggml.c (+8)
@@ -22,6 +22,10 @@
 #include <float.h>
 #include <limits.h>

+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
 #else
 inline static void* ggml_aligned_malloc(size_t size) {
     void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
     if (result != 0) {
         // Handle allocation failure
         return NULL;

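The ggml.c hunk supplies the other half of the requirement: tensor data that may later be wrapped by Metal is allocated through ggml_aligned_malloc, so on Metal builds the alignment is raised from GGML_MEM_ALIGN to the page size. In isolation, the allocation pattern looks like this (a minimal sketch with error handling trimmed; the helper name is illustrative, not from the commit):

// Sketch: allocate page-aligned memory with posix_memalign. The alignment
// must be a power of two and a multiple of sizeof(void *); the system page
// size satisfies both.
#include <stdlib.h>
#include <unistd.h>

static void * alloc_page_aligned(size_t size) {
    void * ptr = NULL;
    if (posix_memalign(&ptr, (size_t) getpagesize(), size) != 0) {
        return NULL; // allocation failed
    }
    return ptr; // release with free()
}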
llama-util.h (+16)
@@ -405,13 +405,29 @@ struct llama_buffer {
     llama_buffer() = default;

     void resize(size_t len) {
+#ifdef GGML_USE_METAL
+        free(addr);
+        int result = posix_memalign((void **) &addr, getpagesize(), len);
+        if (result == 0) {
+            memset(addr, 0, len);
+        }
+        else {
+            addr = NULL;
+        }
+#else
         delete[] addr;
         addr = new uint8_t[len];
+#endif
         size = len;
     }

     ~llama_buffer() {
+#ifdef GGML_USE_METAL
+        free(addr);
+#else
         delete[] addr;
+#endif
+        addr = NULL;
     }

     // disable copy and move

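llama_buffer follows the same rule, which forces an allocator switch: memory obtained from posix_memalign must be released with free(), never delete[], so both resize() and the destructor branch on GGML_USE_METAL. A rough plain-C sketch of the Metal-build resize path (function and parameter names are illustrative, not from the commit):

// Sketch of the GGML_USE_METAL resize path: drop the old block, grab a
// fresh page-aligned one, and zero it.
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void buffer_resize(uint8_t ** addr, size_t * size, size_t len) {
    free(*addr); // free(NULL) is a no-op, so an empty buffer is fine
    if (posix_memalign((void **) addr, (size_t) getpagesize(), len) == 0) {
        memset(*addr, 0, len); // start from a zeroed buffer, as llama_buffer does
    } else {
        *addr = NULL; // leave the buffer empty on failure
    }
    *size = len;
}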
llama.cpp (-13)
@@ -53,7 +53,6 @@ enum e_model {
     MODEL_65B,
 };

-
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

-#ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        ggml_metal_set_tensor(lctx.ctx_metal, embd);
-    }
-#endif
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
         }

         ggml_graph_compute(ctx0, &gf);
-
-        if (lctx.ctx_metal) {
-            // We need to sync the CPU KV cache with the GPU KV cache
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
-        }
     }
 #else
     ggml_graph_compute(ctx0, &gf);
