
Commit db3abcc

sync : ggml (ggml-backend) (#3548)
* sync : ggml (ggml-backend)

  ggml-ci

* zig : add ggml-backend to the build
1 parent eee42c6 commit db3abcc

15 files changed: +1285 -268 lines

CMakeLists.txt (+2)

```diff
@@ -663,6 +663,8 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
+            ggml-backend.c
+            ggml-backend.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
```

Makefile (+5 -2)

```diff
@@ -512,9 +512,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: common/common.cpp common/common.h build-info.h common/log.h
```

build.zig (+8 -7)

```diff
@@ -124,20 +124,21 @@ pub fn build(b: *std.build.Builder) !void {
 
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
```

ggml-alloc.c (+62 -107)

```diff
@@ -1,30 +1,12 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/types.h>
-#include <sys/mman.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
```
```diff
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+    struct ggml_backend_buffer * buffer;
+    bool buffer_owned;
     void * data;
-    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
```
```diff
@@ -119,28 +102,20 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+    return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
     return t->view_src != NULL;
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
```
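With this hunk the allocator no longer hard-codes `ggml_nbytes()` as the allocation size; it asks the owning buffer via `ggml_backend_buffer_get_alloc_size()`, which lets a backend reserve extra room (padding, per-tensor metadata). A purely hypothetical sketch of what such a backend callback could look like; the name and signature are assumptions, since the real interface lives in `ggml-backend.c`/`ggml-backend.h`, which are not shown in this diff:

```c
// Hypothetical, for illustration only: a backend that rounds every
// allocation up to a 256-byte multiple. This is not code from the commit.
static size_t example_get_alloc_size(struct ggml_backend_buffer * buffer, struct ggml_tensor * tensor) {
    (void) buffer; // unused in this sketch
    const size_t pad = 256;
    return (ggml_nbytes(tensor) + pad - 1) / pad * pad; // round up to the padding granularity
}
```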
```diff
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
```
```diff
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
```
```diff
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
-        /*.size          = */ size,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ true,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
```
```diff
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
 
-    void * base_addr;
-    size_t size;
+    return alloc;
+}
 
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
-        /*.size          = */ size,
-        /*.alignment     = */ alignment,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ false,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
```
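A minimal usage sketch (not part of the commit) tying the two constructors together: `ggml_allocr_new_measure()` now sizes a graph against the dummy `(void *)0x1000` address range instead of reserving virtual memory, and `ggml_allocr_new_from_buffer()` wraps a caller-owned buffer. The `build_graph` helper is an assumption standing in for whatever constructs the `ggml_cgraph`; in practice the graph is rebuilt between the measure pass and the real pass:

```c
#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

struct ggml_allocr * alloc_for_graph(struct ggml_cgraph * (*build_graph)(void)) {
    // measure pass: no real memory is committed, only sizes are tracked
    struct ggml_allocr * measure = ggml_allocr_new_measure(/*alignment =*/ 32);
    size_t needed = ggml_allocr_alloc_graph(measure, build_graph());
    ggml_allocr_free(measure);

    // real pass: allocate from a caller-owned CPU buffer
    void * mem = malloc(needed);
    struct ggml_backend_buffer * buf = ggml_backend_cpu_buffer_from_ptr(NULL, mem, needed);
    struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buf);
    ggml_allocr_alloc_graph(alloc, build_graph()); // graph is rebuilt for the real pass
    return alloc;
}
```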
```diff
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
```
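The `buffer_owned` flag set by the constructors decides what gets released here: allocators from `ggml_allocr_new()` and `ggml_allocr_new_measure()` free their buffer, while `ggml_allocr_new_from_buffer()` leaves that to the caller. A short sketch of the resulting contract, with `buf` assumed to have been created earlier:

```c
struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buf); // buffer_owned == false
ggml_allocr_free(alloc);        // frees only the allocator struct
ggml_backend_buffer_free(buf);  // the caller still owns and releases the buffer
```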
```diff
@@ -437,20 +371,30 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
             return false;
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer  = view->view_src->buffer;
+    view->data    = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
```
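For reference, a sketch (not from the commit) of the invariants `init_view()` establishes once `allocate_node()` routes a view through the new path; `ctx` and `src` are assumed to be a valid context and an already-allocated tensor:

```c
struct ggml_tensor * v = ggml_view_1d(ctx, src, 10, 0);
// after allocate_node(alloc, v) has called init_view(alloc, v):
//   v->backend == src->backend
//   v->buffer  == src->buffer
//   v->data    == (char *)src->data + v->view_offs
```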
```diff
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                node->data = parent->data;
+                                node->view_src = view_src;
+                                view_src_hn->n_views += 1;
+                                init_view(alloc, node);
                                 return;
                             }
                         }
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                            node->data = parent->data;
+                            node->view_src = parent;
+                            p_hn->n_views += 1;
+                            init_view(alloc, node);
                             return;
                         }
                     }
```
```diff
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
```
```diff
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
+                if (node->buffer == NULL && node->data != NULL) {
+                    // view of a pre-allocated tensor, didn't call init_view() yet
+                    init_view(alloc, node);
+                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
+                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                    init_view(alloc, parent);
+                }
             }
         }
     }
```
```diff
@@ -631,7 +586,7 @@
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
```
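The single-graph entry point stays a thin wrapper over the renamed `ggml_allocr_alloc_graph_n()`, so the usual per-evaluation pattern is unchanged (sketch; `alloc` and `gf` assumed built elsewhere):

```c
ggml_allocr_reset(alloc);                              // return the whole buffer to the free list
size_t max_size = ggml_allocr_alloc_graph(alloc, gf);  // place every tensor in gf
```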
