Commit

fix metal build
slaren committed Oct 18, 2023
1 parent 38f61d9 commit 4b319cd
Showing 4 changed files with 26 additions and 35 deletions.
examples/gpt-2/main-backend.cpp (15 additions, 15 deletions)
@@ -87,7 +87,7 @@ struct gpt2_model {
     //
     struct ggml_context * ctx;
 
-    ggml_backend_t backends = NULL;
+    ggml_backend_t backend = NULL;
 
     ggml_backend_buffer_t buffer_w;
     ggml_backend_buffer_t buffer_kv;
@@ -238,8 +238,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backends = ggml_backend_cuda_init();
-        if (!model.backends) {
+        model.backend = ggml_backend_cuda_init();
+        if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
         }
     }
@@ -256,19 +256,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     }
 #endif
 
-    if (!model.backends) {
+    if (!model.backend) {
         // fallback to CPU backend
         fprintf(stderr, "%s: using CPU backend\n", __func__);
-        model.backends = ggml_backend_cpu_init();
+        model.backend = ggml_backend_cpu_init();
     }
 
-    if (!model.backends) {
+    if (!model.backend) {
         fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
         return false;
     }
 
     // allocate weights buffer
-    model.buffer_w = ggml_backend_alloc_buffer(model.backends, buffer_size);
+    model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size);
 
     // prepare memory for the weights
     {
@@ -357,7 +357,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
 
     // create a backend buffer (can be in host or device memory)
-    model.buffer_kv = ggml_backend_alloc_buffer(model.backends, memory_size + 256);
+    model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
 
     // allocate the tensors into the backend buffer
     {
@@ -439,7 +439,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 
             ggml_allocr_alloc(alloc, tensor);
 
-            if (ggml_backend_is_cpu  (model.backends)
+            if (ggml_backend_is_cpu  (model.backend)
 #ifdef GGML_USE_METAL
                 || ggml_backend_is_metal(model.backend)
 #endif
@@ -799,15 +799,15 @@ bool gpt2_eval(
     ggml_allocr_alloc_graph(allocr, gf);
 
     // run the computation
-    if (ggml_backend_is_cpu(model.backends)) {
-        ggml_backend_cpu_set_n_threads(model.backends, n_threads);
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(model.backend)) {
         ggml_backend_metal_set_n_cb(model.backend, n_threads);
     }
 #endif
-    ggml_backend_graph_compute(model.backends, gf);
+    ggml_backend_graph_compute(model.backend, gf);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
@@ -876,7 +876,7 @@ int main(int argc, char ** argv) {
     // allocate the compute buffer
     {
         // alignment required by the backend
-        size_t align = ggml_backend_get_alignment(model.backends);
+        size_t align = ggml_backend_get_alignment(model.backend);
         allocr = ggml_allocr_new_measure(align);
 
         // create the worst case graph for memory usage estimation
@@ -889,7 +889,7 @@ int main(int argc, char ** argv) {
 
         // recreate the allocator with the required memory
         ggml_allocr_free(allocr);
-        buf_compute = ggml_backend_alloc_buffer(model.backends, mem_size);
+        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
         allocr = ggml_allocr_new_from_buffer(buf_compute);
 
         fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
@@ -993,7 +993,7 @@ int main(int argc, char ** argv) {
     ggml_backend_buffer_free(model.buffer_w);
     ggml_backend_buffer_free(model.buffer_kv);
     ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backends);
+    ggml_backend_free(model.backend);
 
     return 0;
 }
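
The hunks above finish renaming the single ggml_backend_t member from "backends" to "backend": this example drives exactly one backend. A minimal sketch of that single-backend flow follows, using only calls that appear in the diff; the header name and the standalone helper are assumptions for illustration, not code from this commit.

#include "ggml-backend.h"   // assumed header providing the ggml_backend_* API

// hypothetical helper: pick one backend, set CPU threads, size its weight buffer
static ggml_backend_t init_single_backend(int n_threads, size_t buffer_size,
                                          ggml_backend_buffer_t * buffer_w) {
    // a GPU build would try ggml_backend_cuda_init() / ggml_backend_metal_init() first
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        return NULL;
    }
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
    // weights live in a buffer owned by the chosen backend (host or device memory)
    *buffer_w = ggml_backend_alloc_buffer(backend, buffer_size);
    return backend;
}
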
examples/gpt-2/main.cpp (7 additions, 16 deletions)
@@ -119,18 +119,21 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
         ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
         gpu_backend = ggml_backend_metal_init();
-        if (gpu_backend) {
+        if (!gpu_backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        } else {
             ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads);
         }
     }
 #endif
 
     if (gpu_backend) {
         model.backends.push_back(gpu_backend);
     }
 
     // always add the CPU backend as a fallback
-    model.backends.push_back(ggml_backend_cpu_init());
+    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
+    ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads);
+    model.backends.push_back(cpu_backend);
 }
@@ -874,7 +877,6 @@ struct ggml_cgraph * gpt2_graph(
 bool gpt2_eval(
         const gpt2_model & model,
         ggml_backend_sched_t sched,
-        const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
               std::vector<float>         & embd_w) {
@@ -889,17 +891,6 @@ bool gpt2_eval(
     // allocate tensors
 
     // run the computation
-#if 0
-    ggml_backend_t backend = model.backends.front();
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(backend)) {
-        ggml_backend_metal_set_n_cb(backend, n_threads);
-    }
-#endif
-#endif
     ggml_backend_sched_graph_compute(sched, gf);
 
     //if (n_past%100 == 0) {
@@ -1020,7 +1011,7 @@ int main(int argc, char ** argv) {
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();
 
-            if (!gpt2_eval(model, sched, params.n_threads, n_past, embd, logits)) {
+            if (!gpt2_eval(model, sched, n_past, embd, logits)) {
                 printf("Failed to predict\n");
                 return 1;
             }
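
In main.cpp the thread count is now fixed when each backend is created in init_backends(), so gpt2_eval() no longer takes n_threads and the disabled per-backend dispatch block is removed in favor of the scheduler. A short sketch of that split of responsibilities, assuming the same calls shown above; the helper names are illustrative only.

#include <vector>
#include "ggml-backend.h"   // assumed header for the ggml_backend_* / sched API

// hypothetical helper mirroring the CPU-fallback path added above
static void add_cpu_fallback(std::vector<ggml_backend_t> & backends, int n_threads) {
    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu_backend, n_threads);   // configured once, up front
    backends.push_back(cpu_backend);
}

// hypothetical eval path: no per-call thread handling, the scheduler does the dispatch
static void eval_graph(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    ggml_backend_sched_graph_compute(sched, gf);
}
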
src/ggml-alloc.c (2 additions, 2 deletions)
@@ -17,8 +17,8 @@
 
 //#define GGML_ALLOCATOR_DEBUG
 
-#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
-//#define AT_PRINTF(...) ((void)0)
+//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+#define AT_PRINTF(...)
 
 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
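
AT_PRINTF appears to be the allocator's debug-print macro; the commit turns it back into a no-op so the allocator stays silent by default. A small standalone illustration (not code from the commit) of how the empty variadic definition makes the trace calls compile away:

#include <stdio.h>

//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)   // uncomment to re-enable tracing
#define AT_PRINTF(...)

int main(void) {
    AT_PRINTF("allocating %d bytes\n", 1024);   // expands to nothing with the empty macro
    return 0;
}
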
src/ggml-backend.c (2 additions, 2 deletions)
@@ -418,7 +418,7 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cp
 // scheduler
 
 #define GGML_MAX_BACKENDS 4
-#define GGML_MAX_SPLITS 64
+#define GGML_MAX_SPLITS 256
 #define GGML_MAX_SPLIT_INPUTS 16
 
 struct ggml_backend_sched_split {
@@ -733,7 +733,7 @@ static void sched_split_graph(ggml_backend_sched_t sched) {
     sched->splits[cur_split].i_end = graph->n_nodes;
     sched->n_splits = cur_split + 1;
 
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
 
 #if 1
     // sanity check: all sources should have the same backend as the node
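
The scheduler change raises GGML_MAX_SPLITS from 64 to 256 and comments out the pass-4 debug print. Judging from sched->splits[cur_split] and sched->n_splits above, the splits are presumably kept in a fixed-size array, so the macro bounds how many segments a graph can be split into across backends. A sketch of that layout with made-up struct and field names (the real definitions live in src/ggml-backend.c):

#define GGML_MAX_SPLITS 256

struct sched_split_sketch {
    int i_end;          // end of this split's node range, as assigned above
};

struct sched_sketch {
    struct sched_split_sketch splits[GGML_MAX_SPLITS];   // fixed capacity bounded by the macro
    int n_splits;
};
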
