@@ -1,30 +1,12 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/types.h>
-#include <sys/mman.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+    struct ggml_backend_buffer * buffer;
+    bool buffer_owned;
     void * data;
-    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,28 +102,20 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+    return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
     return t->view_src != NULL;
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char *)alloc->data + alloc->size, (char *)alloc->data + alloc->max_size);
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
-        /*.size          = */ size,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ true,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
 
-    void * base_addr;
-    size_t size;
+    return alloc;
+}
 
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
-        /*.size          = */ size,
-        /*.alignment     = */ alignment,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ false,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
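
For context, a minimal usage sketch of the two construction paths and of what buffer_owned ends up controlling in ggml_allocr_free(). This is not part of the commit; it assumes the ggml-backend API introduced alongside this change (ggml_backend_cpu_init, ggml_backend_alloc_buffer, ggml_backend_free), and the buffer size and alignment are arbitrary:

#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdlib.h>

void example_allocr_ownership(void) {
    const size_t buf_size = 16*1024*1024;

    // path 1: wrap a caller-provided pointer; the allocator creates and owns the
    // ggml_backend_buffer (buffer_owned == true), but not the memory behind it
    void * data = malloc(buf_size);
    struct ggml_allocr * a1 = ggml_allocr_new(data, buf_size, /*alignment =*/ 32);
    ggml_allocr_free(a1); // frees only the wrapping buffer
    free(data);           // the caller still owns the underlying memory

    // path 2: borrow an existing backend buffer (buffer_owned == false)
    struct ggml_backend * cpu = ggml_backend_cpu_init();
    struct ggml_backend_buffer * buf = ggml_backend_alloc_buffer(cpu, buf_size);
    struct ggml_allocr * a2 = ggml_allocr_new_from_buffer(buf);
    ggml_allocr_free(a2); // leaves buf alone; the caller frees it
    ggml_backend_buffer_free(buf);
    ggml_backend_free(cpu);
}
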
@@ -437,20 +371,30 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
             return false;
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer  = view->view_src->buffer;
+    view->data    = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                             // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                             // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                             AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            node->data = parent->data;
+                            node->view_src = view_src;
+                            view_src_hn->n_views += 1;
+                            init_view(alloc, node);
                             return;
                         }
                     }
                     else {
                         AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                        node->data = parent->data;
+                        node->view_src = parent;
+                        p_hn->n_views += 1;
+                        init_view(alloc, node);
                         return;
                     }
                 }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
+                if (node->buffer == NULL && node->data != NULL) {
+                    // view of a pre-allocated tensor, didn't call init_view() yet
+                    init_view(alloc, node);
+                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
+                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                    init_view(alloc, parent);
+                }
             }
         }
     }
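
The two hunks above cover views whose source tensor was allocated outside this allocator. A hypothetical illustration (not from the commit), assuming ggml's behavior at the time that a view of an already-allocated tensor picks up its data pointer when it is created while tensor->buffer stays NULL; the names and sizes are made up:

#include "ggml.h"

// sketch of the "view of a pre-allocated tensor" case handled above,
// e.g. a view into a KV cache that lives in its own ggml_context
void example_preallocated_view(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false, // tensors get data straight from the context pool
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * k_cache = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    // k_cache->data != NULL, k_cache->buffer == NULL

    struct ggml_tensor * k_view = ggml_view_1d(ctx, k_cache, 1024, 0);
    // k_view->data is inherited from k_cache, but k_view->buffer is still NULL,
    // so the graph allocator calls init_view() for it while counting
    // children/views rather than in allocate_node()

    (void) k_view;
    ggml_free(ctx);
}
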
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {