Skip to content

Commit 4dad9fa

Browse files
committed
metal : use residency sets
1 parent 6da9021 commit 4dad9fa

File tree

1 file changed

+162
-5
lines changed

1 file changed

+162
-5
lines changed

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 162 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
// max number of MTLCommandBuffer used to submit a graph for processing
2020
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121

22+
#define GGML_METAL_MAX_RESIDENCY_SETS 128
23+
2224
#define UNUSED(x) (void)(x)
2325

2426
// globals
@@ -37,6 +39,9 @@
3739
id<MTLDevice> mtl_device;
3840
int mtl_device_ref_count;
3941

42+
id<MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
43+
int mtl_residency_set_n;
44+
4045
bool has_simdgroup_reduction;
4146
bool has_simdgroup_mm;
4247
bool has_bfloat;
@@ -46,6 +51,8 @@
4651
} g_ggml_ctx_dev_main = {
4752
/*.mtl_device =*/ nil,
4853
/*.mtl_device_ref_count =*/ 0,
54+
/*.mtl_residency_set =*/ { nil },
55+
/*.mtl_residency_set_n =*/ 0,
4956
/*.has_simdgroup_reduction =*/ false,
5057
/*.has_simdgroup_mm =*/ false,
5158
/*.has_bfloat =*/ false,
@@ -95,6 +102,41 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
95102
}
96103
}
97104

105+
// add residency set
//
// Appends `residency_set` to the device context's fixed-size tracking array
// (capacity GGML_METAL_MAX_RESIDENCY_SETS) and bumps the element count.
//
//   ctx           - device context that owns the tracking array, must not be NULL
//   residency_set - set to track, must not be nil
//
// Returns true when the set was stored, false when the array is already full
// (in which case an error is logged and the context is left untouched).
// Only the pointer is stored; no retain message is sent here.
static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    // validate the argument that is actually passed in (there is no `queue` in this scope)
    assert(residency_set != nil);

    // refuse to write past the end of the fixed-size array
    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
        return false;
    }

    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;

    return true;
}
119+
120+
// remove residency set
//
// Searches the device context's tracking array for `residency_set` (pointer
// comparison) and removes the first match, shifting the following entries
// down by one so the array stays densely packed and keeps its order.
//
//   ctx           - device context that owns the tracking array, must not be NULL
//   residency_set - set to stop tracking, must not be nil
//
// Returns true when an entry was removed, false when the set was not found.
// No release message is sent here; the set itself is not modified.
static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    assert(residency_set != nil);

    for (int i = 0; i < ctx->mtl_residency_set_n; ++i) {
        if (ctx->mtl_residency_set[i] != residency_set) {
            continue;
        }

        // close the gap left by the removed entry
        for (int j = i; j < ctx->mtl_residency_set_n - 1; ++j) {
            ctx->mtl_residency_set[j] = ctx->mtl_residency_set[j + 1];
        }

        ctx->mtl_residency_set_n--;

        // clear the vacated tail slot so the array does not keep a stale
        // pointer to a set that the caller may release right after removal
        ctx->mtl_residency_set[ctx->mtl_residency_set_n] = nil;

        return true;
    }

    return false;
}
139+
98140
// kernels
99141

100142
struct ggml_metal_kernel {
@@ -483,6 +525,11 @@ @implementation GGMLMetalClass
483525
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
484526

485527
ctx->queue = [device newCommandQueue];
528+
if (ctx->queue == nil) {
529+
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
530+
return NULL;
531+
}
532+
486533
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
487534

488535
id<MTLLibrary> metal_library;
@@ -1035,6 +1082,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351082
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361083
int n_buffers;
10371084
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1085+
1086+
id<MTLResidencySet> residency_set;
10381087
};
10391088

10401089
// finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4088,23 @@ static enum ggml_status ggml_metal_graph_compute(
40394088
struct ggml_backend_metal_context * ctx = backend->context;
40404089
struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
40414090

4091+
// attach the residency sets to the queue on the first run
4092+
// also tried attaching them on each run, but it does not make a difference
4093+
static bool is_first = true;
4094+
if (is_first) {
4095+
is_first = false;
4096+
GGML_LOG_INFO("%s: adding %d residency sets\n", __func__, ctx_dev->mtl_residency_set_n);
4097+
[ctx->queue addResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4098+
}
4099+
4100+
// this does not make a difference
4101+
//for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
4102+
// GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
4103+
// [ctx_dev->mtl_residency_set[i] requestResidency];
4104+
//}
4105+
4106+
int64_t t_start_us = ggml_time_us();
4107+
40424108
// number of nodes encoded by the main thread (empirically determined)
40434109
const int n_main = 128;
40444110

@@ -4086,19 +4152,25 @@ static enum ggml_status ggml_metal_graph_compute(
40864152
// the main thread commits the first few commands immediately
40874153
// command_buffer[n_cb]
40884154
{
4089-
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
4155+
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
40904156
ctx->command_buffers[n_cb] = command_buffer;
40914157

4158+
// does not make a difference
4159+
[command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4160+
40924161
[command_buffer enqueue];
40934162
ctx->encode_async(n_cb);
40944163
}
40954164

40964165
// prepare the rest of the command buffers asynchronously
40974166
// command_buffer[0.. n_cb)
40984167
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
4099-
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
4168+
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
41004169
ctx->command_buffers[cb_idx] = command_buffer;
41014170

4171+
// does not make a difference
4172+
[command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4173+
41024174
// always enqueue the first two command buffers
41034175
// enqueue all of the command buffers if we don't need to abort
41044176
if (cb_idx < 2 || ctx->abort_callback == NULL) {
@@ -4163,6 +4235,10 @@ static enum ggml_status ggml_metal_graph_compute(
41634235
}
41644236
}
41654237

4238+
int64_t t_end_us = ggml_time_us();
4239+
4240+
GGML_LOG_DEBUG("%s: compute graph took %8.2f ms\n", __func__, (t_end_us - t_start_us) / 1000.0);
4241+
41664242
return GGML_STATUS_SUCCESS;
41674243
}
41684244

@@ -4176,6 +4252,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764252
for (int i = 0; i < ctx->n_buffers; i++) {
41774253
[ctx->buffers[i].metal release];
41784254
}
4255+
4256+
ggml_backend_metal_device_remove_residency_set(buffer->buft->device->context, ctx->residency_set);
4257+
4258+
[ctx->residency_set endResidency];
4259+
[ctx->residency_set removeAllAllocations];
4260+
[ctx->residency_set release];
4261+
41794262
ggml_backend_metal_device_rel(buffer->buft->device->context);
41804263

41814264
if (ctx->owned) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844367
size_aligned += (size_page - (size_aligned % size_page));
42854368
}
42864369

4287-
id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
4370+
struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
4371+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
42884372

42894373
ctx->all_data = ggml_metal_host_malloc(size_aligned);
42904374
ctx->all_size = size_aligned;
@@ -4307,10 +4391,34 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074391
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
43084392
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
43094393
free(ctx);
4310-
ggml_backend_metal_device_rel(buft->device->context);
4394+
ggml_backend_metal_device_rel(ctx_dev);
43114395
return NULL;
43124396
}
43134397

4398+
{
4399+
MTLResidencySetDescriptor * desc;
4400+
desc = [[MTLResidencySetDescriptor alloc] init];
4401+
desc.label = @"Primary residency set";
4402+
desc.initialCapacity = ctx->n_buffers;
4403+
4404+
NSError *error;
4405+
ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4406+
if (error) {
4407+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4408+
return NULL;
4409+
}
4410+
4411+
for (int i = 0; i < ctx->n_buffers; i++) {
4412+
[ctx->residency_set addAllocation:ctx->buffers[i].metal];
4413+
}
4414+
4415+
[ctx->residency_set commit];
4416+
[ctx->residency_set requestResidency];
4417+
4418+
// track the residency set in the device context
4419+
ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4420+
}
4421+
43144422
//ggml_backend_metal_log_allocated_size(device, size_aligned);
43154423

43164424
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4508,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004508
size_aligned += (size_page - (size_aligned % size_page));
44014509
}
44024510

4403-
id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
4511+
struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4512+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
44044513

44054514
// the buffer fits into the max buffer size allowed by the device
44064515
if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4562,30 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534562
}
44544563
}
44554564

4565+
{
4566+
MTLResidencySetDescriptor * desc;
4567+
desc = [[MTLResidencySetDescriptor alloc] init];
4568+
desc.label = @"Primary residency set";
4569+
desc.initialCapacity = ctx->n_buffers;
4570+
4571+
NSError *error;
4572+
ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4573+
if (error) {
4574+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4575+
return NULL;
4576+
}
4577+
4578+
for (int i = 0; i < ctx->n_buffers; i++) {
4579+
[ctx->residency_set addAllocation:ctx->buffers[i].metal];
4580+
}
4581+
4582+
[ctx->residency_set commit];
4583+
[ctx->residency_set requestResidency];
4584+
4585+
// track the residency set in the device context
4586+
ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4587+
}
4588+
44564589
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
44574590
}
44584591

@@ -4766,6 +4899,30 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664899
}
47674900
}
47684901

4902+
{
4903+
MTLResidencySetDescriptor * desc;
4904+
desc = [[MTLResidencySetDescriptor alloc] init];
4905+
desc.label = @"Primary residency set";
4906+
desc.initialCapacity = ctx->n_buffers;
4907+
4908+
NSError *error;
4909+
ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4910+
if (error) {
4911+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4912+
return NULL;
4913+
}
4914+
4915+
for (int i = 0; i < ctx->n_buffers; i++) {
4916+
[ctx->residency_set addAllocation:ctx->buffers[i].metal];
4917+
}
4918+
4919+
[ctx->residency_set commit];
4920+
[ctx->residency_set requestResidency];
4921+
4922+
// track the residency set in the device context
4923+
ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4924+
}
4925+
47694926
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
47704927
}
47714928

0 commit comments

Comments
 (0)