From 130adf841580b21398ce0de1aaa4ab1add6e9978 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 01/48] Introduce ggml_compute_threadpool - OpenMP functional: check - Vanilla ggml functional: Check - ggml w/threadpool functional: Check - OpenMP no regression: No glaring problems - Vanilla ggml no regression: No glaring problems - ggml w/threadpool no regression: No glaring problems --- common/common.cpp | 294 ++++++- common/common.h | 29 +- examples/CMakeLists.txt | 2 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/benchmark/benchmark-matmult.cpp | 2 +- .../cvector-generator/cvector-generator.cpp | 4 +- examples/export-lora/export-lora.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 51 ++ examples/llava/llava-cli.cpp | 4 +- examples/main/main.cpp | 30 + examples/server/server.cpp | 4 +- ggml/CMakeLists.txt | 2 +- ggml/include/ggml-alloc.h | 5 +- ggml/include/ggml-backend.h | 1 + ggml/include/ggml.h | 28 +- ggml/src/ggml-backend.c | 16 +- ggml/src/ggml.c | 800 ++++++++++++++---- include/llama.h | 12 + src/llama.cpp | 93 +- tests/test-rope.cpp | 2 +- 20 files changed, 1168 insertions(+), 215 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 715adf94658f0..6c927fc17890b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -277,6 +277,36 @@ void gpt_params_handle_model_default(gpt_params & params) { } } +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { + int32_t n_set = 0; + + if (cpuparams.n_threads < 0) { + // Assuming everything about cpuparams is invalid + if (role_model != nullptr) { + cpuparams = *role_model; + } else { + cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (cpuparams.cpumask[i]) { + n_set++; + } + } + + if (n_set == 0) { + // You hit the jackpot! + memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS); + n_set = GGML_MAX_N_THREADS; + } + + if (n_set < cpuparams.n_threads) { + // Not enough set bits, may experience performance issues. 
+ fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -296,6 +326,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -331,7 +366,7 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); get_env("LLAMA_ARG_HF_REPO", params.hf_repo); get_env("LLAMA_ARG_HF_FILE", params.hf_file); - get_env("LLAMA_ARG_THREADS", params.n_threads); + get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads); get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); get_env("LLAMA_ARG_BATCH", params.n_batch); @@ -368,6 +403,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { + size_t dash_loc = range.find('-'); + if (dash_loc == std::string::npos) { + fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + return false; + } + + size_t start_i; + size_t end_i; + + if (dash_loc == 0) { + start_i = 0; + } else { + start_i = std::stoull(range.substr(0, dash_loc)); + if (start_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "Start index out of bounds!\n"); + return false; + } + } + + if (dash_loc == range.length() - 1) { + end_i = GGML_MAX_N_THREADS - 1; + } else { + end_i = std::stoull(range.substr(dash_loc + 1)); + if (end_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "End index out of bounds!\n"); + return false; + } + } + + for (size_t i = start_i; i <= end_i; i++) { + boolmask[i] = true; + } + + return true; +} + +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { + // Discard potential 0x prefix + size_t start_i = 0; + if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { + start_i = 2; + } + + size_t num_digits = mask.length() - start_i; + if (num_digits > 128) num_digits = 128; + + size_t end_i = num_digits + start_i; + + for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { + char c = mask.at(i); + int8_t id = c; + + if ((c >= '0' && c <= '9')) { + id -= '0'; + } else if (c >= 'a' && c <= 'f') { + id -= 'a' - 10; + } else if (c >= 'A' && c <= 'F') { + id -= 'A' - 10; + } else { + fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + return false; + } + + boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); + boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); + boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); + boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + } + + return true; +} + #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { @@ -384,36 +492,137 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "-t" || arg == 
"--threads") { CHECK_ARG - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = std::stoi(argv[i]); + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-C" || arg == "--cpu-mask") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + return true; + } + if (arg == "-Cr" || arg == "--cpu-range") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); + return true; + } + if (arg == "--prio") { + CHECK_ARG + params.cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll") { + params.cpuparams.poll = true; + return true; + } if (arg == "-tb" || arg == "--threads-batch") { CHECK_ARG - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Cb" || arg == "--cpu-mask-batch") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crb" || arg == "--cpu-range_batch") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch") { + CHECK_ARG + params.cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch") { + params.cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch") { + params.cpuparams_batch.poll = true; + return true; + } if (arg == "-td" || arg == "--threads-draft") { CHECK_ARG - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; + } + if (arg == "-Cd" || arg == "--cpu-mask-draft") { + CHECK_ARG + std::string mask = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "-Crd" || arg == "--cpu-range-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "--prio-draft") { + CHECK_ARG + params.draft_cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-draft") { + params.draft_cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll-draft") { + params.draft_cpuparams.poll = true; + return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { CHECK_ARG - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - 
     if (arg == "-tbd" || arg == "--threads-batch-draft") {
         CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cbd" || arg == "--cpu-mask-batch-draft") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        params.draft_cpuparams_batch.poll = true;
+        return true;
+    }
     if (arg == "-p" || arg == "--prompt") {
         CHECK_ARG
         params.prompt = argv[i];
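The postprocessing pass shown earlier in `gpt_params_parse_ex()` gives these four families sensible defaults: a role that was never configured inherits everything from its "role model". A sketch of the effective behavior, using only the helpers introduced by this patch:

```cpp
#include "common.h"  // cpu_params and postprocess_cpu_params() from this patch

void inherit_demo() {
    cpu_params gen;                       // configured via -t / -C / --prio ...
    gen.n_threads = 6;

    cpu_params batch;                     // left untouched: n_threads stays -1

    postprocess_cpu_params(gen);          // no mask bits set -> enable all CPUs
    postprocess_cpu_params(batch, &gen);  // invalid params -> copy the role model

    // batch.n_threads is now 6 and batch.cpumask equals gen.cpumask
}
```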
@@ -1498,11 +1707,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "    --no-display-prompt",  "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color",             "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
     options.push_back({ "*", "-s, --seed SEED",          "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*", "-t, --threads N",          "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*", "-t, --threads N",          "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
+    options.push_back({ "*", "-C, --cpu-mask M",         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+    options.push_back({ "*", "-Cr, --cpu-range lo-hi",   "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*", "    --cpu-strict",         "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*", "    --prio N",             "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*", "    --poll",               "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
     options.push_back({ "*", "-tb, --threads-batch N",   "number of threads to use during batch and prompt processing (default: same as --threads)" });
+    options.push_back({ "*", "-Cb, --cpu-mask-batch M",  "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
+                                                         "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*", "    --cpu-strict-batch",   "use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*", "    --prio-batch N",       "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio)"});
+    options.push_back({ "*", "    --poll-batch",         "use polling to wait for work (default: --poll)"});
     options.push_back({ "speculative", "-td, --threads-draft N",  "number of threads to use during generation (default: same as --threads)" });
+    options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi",
+                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", "    --cpu-strict-draft",  "Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", "    --prio-draft N",      "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+    options.push_back({ "speculative", "    --poll-draft",        "Use polling to wait for draft model work (default: same as --poll)"});
     options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M",
+                                                         "Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"});
+    options.push_back({ "speculative", "    --cpu-strict-batch-draft",
+                                                         "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+    options.push_back({ "speculative", "    --prio-batch-draft N",
+                                                         "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio-draft)"});
+    options.push_back({ "speculative", "    --poll-batch-draft",  "Use polling to wait for draft model work (default: --poll-draft)"});
     options.push_back({ "speculative", "    --draft N",           "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps, --p-split N",        "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@@ -1774,7 +2010,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "export-lora", "-m,    --model",           "model path from which to load base model (default '%s')", params.model.c_str() });
     options.push_back({ "export-lora", "       --lora FNAME",      "path to LoRA adapter  (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", "       --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",       "number of threads to use during computation (default: %d)", params.n_threads });
     options.push_back({ "export-lora", "-o,    --output FNAME",    "output file (default: '%s')", params.lora_outfile.c_str() });
 
     printf("usage: %s [options]\n", argv[0]);
@@ -1806,9 +2041,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
 #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
     // TODO: windows + arm64 + mingw64
@@ -2332,8 +2567,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max        = params.n_parallel;
     cparams.n_batch          = params.n_batch;
     cparams.n_ubatch         = params.n_ubatch;
-    cparams.n_threads        = params.n_threads;
-    cparams.n_threads_batch  = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads        = params.cpuparams.n_threads;
+    cparams.n_threads_batch  = params.cpuparams_batch.n_threads == -1 ?
+                                   params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed             = params.seed;
     cparams.logits_all       = params.logits_all;
     cparams.embeddings       = params.embedding;
@@ -2359,6 +2595,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return cparams;
 }
 
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    tpp.mask_specified = params.mask_valid;
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.n_threads  = params.n_threads;
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -3348,7 +3600,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index f603ba2be1d35..0f0346065b425 100644
--- a/common/common.h
+++ b/common/common.h
@@ -67,13 +67,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params {
+    int32_t n_threads                   = -1;
+    bool    cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask
+    bool    mask_valid                  = false;   // Default: any CPU
+    int32_t priority                    = 0;       // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool    strict_cpu                  = false;   // Use strict CPU placement
+    bool    poll                        = false;   // Use polling (busywait) to wait for work
+};
+
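Putting the two helpers together, a caller can go from a CLI-level `cpu_params` to a running ggml threadpool. A sketch using only the APIs introduced by this patch:

```cpp
#include "common.h"  // cpu_params and the conversion helpers from this patch
#include "ggml.h"

int make_pool_demo() {
    cpu_params cp;
    cp.n_threads = 8;
    cp.priority  = 1;             // "medium" in the patch's prio scheme
    postprocess_cpu_params(cp);   // fill in mask / thread-count defaults

    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(cp);

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);
    if (!tp) {
        return 1;  // creation failed
    }

    // ... attach with llama_attach_threadpool(ctx, tp),
    //     or hand it to ggml_graph_plan() directly ...

    ggml_release_threadpool(tp);
    return 0;
}
```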
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 0;     // context size
     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -100,6 +105,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx         = 0;     // YaRN original context length
     float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
 
@@ -204,7 +214,7 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";
@@ -277,6 +287,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask (const std::string & mask,  bool (&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+
 //
 // String utils
 //
@@ -327,8 +341,9 @@ struct llama_init_result {
 
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67b3d27747850..247d52c6d3454 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    add_subdirectory(speculative)
+    #add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index aca332e9464d2..3ce91070b4ed7 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 47cb16c69d536..e78f6b388ef6e 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 8fa492571aa44..a68268388389d 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
     if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
-        pca_params.n_threads    = params.n_threads;
-        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
     } else {
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index c7e5ca78845ee..8df457e219493 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 42918bfc79f22..5a929ceddafbe 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -235,6 +235,7 @@ struct cmd_params {
     std::vector<bool>  use_mmap;
     std::vector<bool>  embeddings;
     ggml_numa_strategy numa;
+    cpu_params         cpuparams;
     int                reps;
     bool               verbose;
     output_formats     output_format;
@@ -261,6 +262,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap      */ {true},
     /* embeddings    */ {false},
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* cpuparams     */ {},
     /* reps          */ 5,
     /* verbose       */ false,
     /* output_format */ MARKDOWN,
@@ -289,6 +291,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>               (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                    (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl>   (default: disabled)\n");
+    printf("  -mt, --max-threads <n>                (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                  (default: 0x0)\n");
+    printf("  --cpu-strict <0|1>                    (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --prio <0|1|2|3>                      (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll <0|1>                          (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>             (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>      (default: 0)\n");
     printf("  -r, --repetitions <n>                 (default: %d)\n", cmd_params_defaults.reps);
@@ -492,6 +499,30 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+        } else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string mask = argv[i];
+            params.cpuparams.mask_valid = true;
+            invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1402,6 +1433,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads      = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu     = params.cpuparams.strict_cpu;
+    tpp.prio           = params.cpuparams.priority;
+    tpp.poll           = params.cpuparams.poll;
+
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
+
+    struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1427,6 +1475,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
         llama_kv_cache_clear(ctx);
+        llama_attach_threadpool(ctx, threadpool);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1468,6 +1517,8 @@ int main(int argc, char ** argv) {
         llama_free(ctx);
     }
 
+    ggml_release_threadpool(threadpool);
+
     llama_free_model(lmodel);
 
     if (p) {
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8c7dd2ae3d0dc..86b39f20eea6e 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
             LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
diff 
--git a/examples/main/main.cpp b/examples/main/main.cpp index 4a342ad031663..1628a42a9bd7b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -221,6 +221,33 @@ int main(int argc, char ** argv) { return 1; } + LOG("%s: llama threadpool init = n_threads = %d\n", + __func__, + (int32_t) params.cpuparams.n_threads + ); + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_batch_threadpool(ctx, threadpool_batch); + llama_attach_threadpool(ctx, threadpool); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + llama_attach_threadpool(ctx_guidance, threadpool); + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); @@ -989,6 +1016,9 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); + ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool_batch); + #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e79e7aa2cb846..06b782ca62ad5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) { }); LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, + {"n_threads", params.cpuparams.n_threads}, + {"n_threads_batch", params.cpuparams_batch.n_threads}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index cc16858849783..a5a8c5d8c2887 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -146,7 +146,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING "ggml: metal minimum macOS version") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -option(GGML_OPENMP "ggml: use OpenMP" ON) +option(GGML_OPENMP "ggml: use OpenMP" OFF) option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 434c13b34a929..cd85b6ee70560 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -7,8 +7,9 @@ extern "C" { #endif typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 5f3f1e286990e..c59f9f54a44b9 100644 --- a/ggml/include/ggml-backend.h +++ 
b/ggml/include/ggml-backend.h
@@ -102,6 +102,7 @@ extern "C" {
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
     GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
     GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a7e9dc9b2ff63..9b29a3af72d81 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -231,6 +231,8 @@
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
 #endif
+#define GGML_MAX_N_THREADS      512
+
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -624,6 +626,17 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
+    struct ggml_threadpool_params {
+        bool    cpumask[GGML_MAX_N_THREADS];
+        bool    mask_specified;
+        int32_t n_threads;
+        int32_t prio;
+        bool    poll;
+        bool    strict_cpu;
+    };
+
+    struct ggml_compute_threadpool; // forward declaration, see ggml.c
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -631,6 +644,7 @@ extern "C" {
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
+        struct ggml_compute_threadpool * threadpool;
 
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
@@ -2010,10 +2024,20 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
+    GGML_API struct ggml_compute_threadpool * ggml_create_threadpool       (struct ggml_threadpool_params * params);
+    GGML_API void                             ggml_release_threadpool      (struct ggml_compute_threadpool * threadpool);
+    GGML_API int32_t                          ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
+    GGML_API void                             ggml_pause_threadpool        (struct ggml_compute_threadpool * threadpool);
+    GGML_API void                             ggml_resume_threadpool       (struct ggml_compute_threadpool * threadpool);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                  int                        n_threads, /* = GGML_DEFAULT_N_THREADS */
+                  struct ggml_compute_threadpool * threadpool /* = NULL */ );
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 8856967c91104..826b99ac01ace 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -722,7 +722,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif
 
 struct 
ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -903,6 +906,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e52471ce3f861..7d8d179186306 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1868,28 +1868,104 @@ struct ggml_context_container { struct ggml_context context; }; -struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; +// +// Threading defs +// + +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; - int n_threads; +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) + +#define ggml_lock_init(x) UNUSED(x) 
+#define ggml_lock_destroy(x)    UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x)       _mm_pause()
+#else
+#define ggml_lock_lock(x)       UNUSED(x)
+#endif
+#define ggml_lock_unlock(x)     UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#endif
+
+// Threadpool def
+struct ggml_compute_threadpool {
+    ggml_mutex_t mutex;   // mutex for cond.var
+    ggml_cond_t  cond;    // cond.var for waiting for new work
+
+    struct ggml_cgraph * cgraph;
+    struct ggml_cplan  * cplan;
 
     // synchronization primitives
     atomic_int n_barrier;
     atomic_int n_barrier_passed;
+    atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
+
+    volatile bool stop;       // Used for stopping the threadpool altogether
+    volatile bool pause;      // Used for pausing the threadpool or individual threads
+    volatile bool new_work;   // Set when there is work to be done, unset after it's done
+
+    struct ggml_compute_state * workers; // per thread state
+    int32_t n_threads_max;               // number of threads in the pool
+    int32_t n_threads_cur;               // number of threads used in the current graph
+
+    int32_t prio;       // Scheduling priority
+    bool    disposable; // Doesn't initialize a cond-var
+    bool    poll;       // Use polling (busywait) // TODO
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 
-    atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
 
     enum ggml_status ec;
 };
 
+// Per-thread state
 struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
+    bool cpumask[GGML_MAX_N_THREADS];
+    bool mask_specified;
+#endif
+    struct ggml_compute_threadpool * threadpool;
     int ith;
-    struct ggml_compute_state_shared * shared;
 };
 
 struct ggml_compute_params {
@@ -1900,7 +1976,7 @@ struct ggml_compute_params {
     size_t wsize;
     void * wdata;
 
-    struct ggml_compute_state_shared * shared;
+    struct ggml_compute_threadpool * threadpool;
 };
 
 //
@@ -2995,23 +3071,23 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
-    if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+    if (threadpool->n_threads_cur == 1) {
         return;
     }
 
     #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
-    if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+    if (threadpool->n_threads_cur == 1) {
         return;
     }
 
-    atomic_int * n_barrier = &shared->n_barrier;
-    atomic_int * n_barrier_passed = &shared->n_barrier_passed;
+    atomic_int * n_barrier = &threadpool->n_barrier;
+    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
 
-    int n_threads = shared->n_threads;
+    int n_threads = threadpool->n_threads_cur;
     int passed_old = atomic_load(n_barrier_passed);
 
     if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
@@ -9993,7 +10069,7 @@ static void ggml_compute_forward_acc_f32(
                 ((char *)  src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     const int ith = params->ith;
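The non-OpenMP `ggml_barrier()` above is a classic two-counter (generation-counting) barrier: the last thread to arrive resets the arrival counter and bumps `n_barrier_passed`, while everyone else spins until they observe the generation change. A self-contained sketch of the same idea using `std::atomic` (illustration only, not the patch's code, which uses C11 atomics and also spins with a relax hint):

```cpp
#include <atomic>
#include <thread>
#include <vector>

static std::atomic<int> n_barrier{0};
static std::atomic<int> n_barrier_passed{0};

static void barrier(int n_threads) {
    int passed_old = n_barrier_passed.load();
    if (n_barrier.fetch_add(1) == n_threads - 1) {
        // last thread in: reset the arrival counter, release everyone
        n_barrier.store(0);
        n_barrier_passed.fetch_add(1);
    } else {
        // spin until the last thread flips the generation counter
        while (n_barrier_passed.load() == passed_old) {
            std::this_thread::yield();
        }
    }
}

int main() {
    const int n = 4;
    std::vector<std::thread> ts;
    for (int i = 0; i < n; i++) {
        ts.emplace_back([n] { barrier(n); barrier(n); });
    }
    for (auto & t : ts) t.join();
    return 0;
}
```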
@@ -12363,10 +12439,10 @@ UseGgmlGemm1:;
 
     if (ith == 0) {
         // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
-        atomic_store(&params->shared->current_chunk, nth);
+        atomic_store(&params->threadpool->current_chunk, nth);
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
@@ -12474,7 +12550,7 @@ UseGgmlGemm2:;
             break;
         }
 
-        current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
+        current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1);
     }
 }
 
@@ -12569,7 +12645,7 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -12723,7 +12799,7 @@ static void ggml_compute_forward_out_prod_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // dst[:,:,:,:] = 0
     // for i2,i3:
@@ -12841,7 +12917,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // parallelize by last three dimensions
 
@@ -13027,7 +13103,7 @@ static void ggml_compute_forward_set_f32(
                 ((char *)  src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     const int ith = params->ith;
@@ -13606,7 +13682,7 @@ static void ggml_compute_forward_diag_mask_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     // TODO: handle transposed/permuted matrices
@@ -14382,7 +14458,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14470,7 +14546,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14757,7 +14833,7 @@ static void ggml_compute_forward_conv_transpose_2d(
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t stride = ggml_get_op_params_i32(dst, 0);
 
@@ -15503,7 +15579,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
     if (ith == 0) {
         memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int64_t elem_q = ggml_nelements(q);
     const int64_t elem_k = ggml_nelements(k);
@@ -16194,7 +16270,7 @@ static void ggml_compute_forward_add_rel_pos_f32(
     if (params->ith == 0) {
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 }
 
 // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
@@ -16479,7 +16555,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     if (ith == 0) {
         memset(sums, 0, sizeof(float) * (nth + nth * nc));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const double eps = 1e-9;
 
@@ -16527,7 +16603,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         }
 #endif
     }
-    ggml_barrier(params->shared);
+
ggml_barrier(params->threadpool); if (ith == 0) { float * dp = (float *) dst->data; @@ -18268,65 +18344,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -18603,9 +18620,292 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } -struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +enum { + SCHED_PRIO_NORMAL, + SCHED_PRIO_MEDIUM, + SCHED_PRIO_HIGH, + SCHED_PRIO_REALTIME +}; + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +static bool __thread_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_MAX_N_THREADS >= 64); + + for (int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)bitmask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool __process_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + return SetPriorityClass(GetCurrentProcess(), p); +} + +static bool 
__thread_priority(int32_t prio) {
+    DWORD p = NORMAL_PRIORITY_CLASS;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
+        case SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
+        case SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
+        case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+    }
+
+    return SetThreadPriority(GetCurrentThread(), p);
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool __thread_affinity(const bool * mask) {
+    UNUSED(mask);
+    return true;
+}
+
+static bool __process_priority(int32_t prio) {
+    int32_t p = 0;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p =   0; break;
+        case SCHED_PRIO_MEDIUM:   p =  -5; break;
+        case SCHED_PRIO_HIGH:     p = -10; break;
+        case SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    int32_t r = setpriority(PRIO_PROCESS, 0, p);
+    return r != -1;
+}
+
+static bool __thread_priority(int32_t prio) {
+    UNUSED(prio);
+    return true;
+}
+
+#else // posix?
+
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <sched.h>
+
+static bool __thread_affinity(const bool * mask) {
+    cpu_set_t cpuset;
+    int32_t err;
+
+    CPU_ZERO(&cpuset);
+
+    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            //printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            CPU_SET(i, &cpuset);
+        }
+    }
+
+#ifdef __ANDROID__
+    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+    if (err < 0) {
+        err = errno;
+    }
+#else
+    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err));
+        return false;
+    }
+
+    return true;
+}
+
+static bool __process_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority =  0; break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = sched_setscheduler(0, policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err);
+        return false;
+    }
+
+    return true;
+}
+
+static bool __thread_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority =  0; break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err);
+        return false;
+    }
+
+    return true;
+}
+
+#endif
+
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;}
+#endif
+
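One caveat about the POSIX mapping above: SCHED_FIFO is a real-time policy, and on Linux an unprivileged process needs CAP_SYS_NICE (or a suitable RLIMIT_RTPRIO) to request it, so `--prio` levels above 0 can fail quietly here. A quick standalone check (illustrative, not part of the patch):

```cpp
#include <cstdio>
#include <sched.h>

int main() {
    sched_param p{};
    p.sched_priority = 40; // the value the patch uses for "medium"
    if (sched_setscheduler(0, SCHED_FIFO, &p) != 0) {
        perror("SCHED_FIFO unavailable (need CAP_SYS_NICE?)");
        return 1;
    }
    printf("SCHED_FIFO granted\n");
    return 0;
}
```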
+static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t * iter) {
+    if (!global_mask) {
+        memset(local_mask, 1, GGML_MAX_N_THREADS);
+        return;
+    }
+    if (!strict) {
+        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+        return;
+    }
+
+    memset(local_mask, 0, GGML_MAX_N_THREADS);
+    int32_t base_idx = *iter;
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        int32_t idx = base_idx + i;
+        if (idx >= GGML_MAX_N_THREADS) {
+            // Just a cheaper modulo
+            idx -= GGML_MAX_N_THREADS;
+        }
+        if (global_mask[idx]) {
+            local_mask[idx] = 1;
+            *iter = idx + 1;
+            return;
+        }
+    }
+}
+
+void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
+    if (!threadpool) return;
+
+#ifndef GGML_USE_OPENMP
+    struct ggml_compute_state* workers = threadpool->workers;
+    const int32_t n_threads = threadpool->n_threads_max;
+
+    if (!threadpool->disposable) {
+        ggml_mutex_lock(&threadpool->mutex);
+    }
+    threadpool->n_threads_cur = n_threads;
+    threadpool->stop = true;
+    threadpool->pause = false;
+    if (!threadpool->disposable) {
+        ggml_cond_broadcast(&threadpool->cond);
+        ggml_mutex_unlock(&threadpool->mutex);
+    }
+
+    for (int32_t j = 1; j < n_threads; j++) {
+        int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+        UNUSED(rc);
+    }
+
+    GGML_ALIGNED_FREE(workers);
+
+    if (!threadpool->disposable) {
+        ggml_mutex_destroy(&threadpool->mutex);
+        ggml_cond_destroy(&threadpool->cond);
+    }
+#endif // GGML_USE_OPENMP
+
+    GGML_ALIGNED_FREE(threadpool);
+}
+
+void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    GGML_ASSERT(!threadpool->disposable);
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    GGML_ASSERT(!threadpool->disposable);
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+
+    ggml_mutex_lock(&threadpool->mutex);
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
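With the lifecycle functions in place, the intended usage pattern is: create a pool (non-disposable pools start paused), plan a graph against it, compute, and idle the workers between requests. A hedged sketch; exactly when ggml_graph_compute() wakes the workers depends on the dispatch path later in this patch, so treat the pause/resume calls as optional idle management rather than a required protocol:

```cpp
#include <vector>
#include "ggml.h"

enum ggml_status run_graph(struct ggml_cgraph * graph) {
    struct ggml_threadpool_params tpp;
    tpp.mask_specified = false;   // no pinning
    tpp.n_threads      = 8;
    tpp.prio           = 0;
    tpp.poll           = false;
    tpp.strict_cpu     = false;

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);

    struct ggml_cplan cplan = ggml_graph_plan(graph, tpp.n_threads, tp);
    std::vector<uint8_t> work(cplan.work_size);
    cplan.work_data = work.data();

    enum ggml_status st = ggml_graph_compute(graph, &cplan);

    ggml_pause_threadpool(tp);    // optional: idle the workers between graphs
    ggml_resume_threadpool(tp);   // ... and wake them for the next one

    ggml_release_threadpool(tp);
    return st;
}
```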
+struct ggml_cplan ggml_graph_plan(
+          const struct ggml_cgraph * cgraph,
+                int32_t              n_threads,
+          struct ggml_compute_threadpool * threadpool) {
+
+    if (threadpool == NULL) {
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool\n");
+    }
     if (n_threads <= 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
     size_t work_size = 0;
@@ -18761,12 +19061,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     }
 
     if (work_size > 0) {
-        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+        work_size += CACHE_LINE_SIZE*(n_threads);
     }
 
-    cplan.n_threads = MIN(max_tasks, n_threads);
-    cplan.work_size = work_size;
-    cplan.work_data = NULL;
+    cplan.threadpool = threadpool;
+    cplan.n_threads  = MIN(max_tasks, n_threads);
+    cplan.work_size  = work_size;
+    cplan.work_data  = NULL;
 
     return cplan;
 }
@@ -18774,36 +19075,206 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
-    const struct ggml_cgraph * cgraph = state->shared->cgraph;
-    const struct ggml_cplan  * cplan  = state->shared->cplan;
+    const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
+    const struct ggml_cplan  * cplan  = state->threadpool->cplan;
 
     set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
-        /*.ith   =*/ state->ith,
-        /*.nth   =*/ state->shared->n_threads,
-        /*.wsize =*/ cplan->work_size,
-        /*.wdata =*/ cplan->work_data,
-        /*.shared=*/ state->shared,
+        /*.ith        =*/ state->ith,
+        /*.nth        =*/ state->threadpool->n_threads_cur,
+        /*.wsize      =*/ cplan->work_size,
+        /*.wdata      =*/ cplan->work_data,
+        /*.threadpool =*/ state->threadpool,
     };
 
-    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
-        struct ggml_tensor * node = cgraph->nodes[node_n];
+    struct ggml_tensor * node = cgraph->nodes[0];
+
+    ggml_compute_forward(&params, node);
+    if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+        state->threadpool->ec = GGML_STATUS_ABORTED;
+    }
+
+    for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) {
+        ggml_barrier(state->threadpool);
+        if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
+            break;
+        }
+
+        node = cgraph->nodes[node_n];
 
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->ec = GGML_STATUS_ABORTED;
+            state->threadpool->ec = GGML_STATUS_ABORTED;
         }
+    }
 
-        ggml_barrier(state->shared);
+    if (!state->threadpool->disposable && state->ith == 0) {
+        state->threadpool->new_work = false;
+    }
 
-        if (state->shared->ec != GGML_STATUS_SUCCESS) {
-            break;
+    ggml_barrier(state->threadpool);
+
+    return 0;
+}
+
+
+
+#ifndef GGML_USE_OPENMP
+
+static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    do {
+        if (threadpool->poll) {
+            while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
+                // No new work. Yield and keep polling.
+                //__cpu_relax();
+            }
+        } else {
+            ggml_mutex_lock_shared(&threadpool->mutex);
+            while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
+                // No new work. Wait for the signal.
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } + } while (state->ith >= threadpool->n_threads_cur); + return threadpool->new_work; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_compute_threadpool * threadpool = state->threadpool; + + GGML_ASSERT(!threadpool->disposable); + + __thread_priority(threadpool->prio); + if (state->mask_specified) + __thread_affinity(state->cpumask); + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // The main thread is the only one that can dispatch new work + + bool new_work = ggml_graph_compute_check_for_work(state); + if (new_work) { + int64_t ret = (int64_t) ggml_graph_compute_thread(state); + if (ret == GGML_EXIT_ABORTED) + return (thread_ret_t) ret; + + if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) { + fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret); + GGML_ASSERT(false); + } } } - return 0; + return (thread_ret_t) 0; +} + +#endif // GGML_USE_OPENMP + +static struct ggml_compute_threadpool * ggml_create_threadpool_impl( + struct ggml_threadpool_params * tpp, + bool disposable, + struct ggml_cgraph * cgraph, + struct ggml_cplan * cplan) { + + struct ggml_compute_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool)); + { + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->stop = false; + threadpool->pause = disposable ? false : true; + threadpool->new_work = false; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; + threadpool->disposable = disposable; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + + threadpool->abort_callback = NULL; + threadpool->abort_callback_data = NULL; + threadpool->ec = GGML_STATUS_SUCCESS; + } + +#ifndef GGML_USE_OPENMP + if (!disposable) { + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + } +#endif // GGML_USE_OPENMP + + struct ggml_compute_state * workers = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads); + + threadpool->workers = workers; + +#ifdef GGML_USE_OPENMP + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .threadpool = threadpool, + .ith = j + }; + } +#else // Not using OPENMP + int32_t cpumask_iter = 0; + + __process_priority(tpp->prio); + __thread_priority(tpp->prio); + + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .mask_specified = tpp->mask_specified, + .threadpool = threadpool, + .ith = j + }; + + if (tpp->mask_specified) { + __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + } + + // Disposable threadpools need to have a valid cplan and cgraph immediately. 
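+ // A disposable pool's workers run ggml_graph_compute_thread directly and exit once the graph is done;
+ // persistent pools park secondary threads in ggml_graph_compute_secondary_thread, where they wait or poll for new work.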
+ thread_ret_t (*thread_entrypoint)(void*) = disposable ? ggml_graph_compute_thread : ggml_graph_compute_secondary_thread; + // Spin threads for all secondary workers + if (j > 0) { + int32_t rc = ggml_thread_create( + &workers[j].thrd, + NULL, + thread_entrypoint, + &workers[j] + ); + GGML_ASSERT(rc == 0); + } + } +#endif // GGML_USE_OPENMP + + return threadpool; +} + +struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { + return ggml_create_threadpool_impl(tpp, false, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -18811,19 +19282,41 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - int n_threads = cplan->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.n_threads =*/ n_threads, - /*.n_barrier =*/ 0, - /*.n_barrier_passed =*/ 0, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, - /*.current_chunk =*/ 0, - /*.ec =*/ GGML_STATUS_SUCCESS, - }; + int32_t n_threads = cplan->n_threads; + struct ggml_compute_threadpool * threadpool = cplan->threadpool; + + bool disposable_threadpool = false; + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("NOTE: No threadpool was specified in this cplan. Will create a disposable threadpool\n"); + disposable_threadpool = true; + + struct ggml_threadpool_params ttp = { + .mask_specified = false, + .n_threads = n_threads, + .prio = 1, + .poll = false, + .strict_cpu = false + }; + + threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); + } else if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); + } + + // Set up work + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; + + if (!disposable_threadpool) { + // Reset some of the paramters that need resetting + // No worker threads should be accessing the parameters below at this stage + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; + } #ifdef GGML_USE_OPENMP if (n_threads > 1) { @@ -18833,63 +19326,52 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl { // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); - state_shared.n_threads = n_threads; + threadpool->n_threads_cur = n_threads; } struct ggml_compute_state worker = { - .thrd = 0, - .ith = omp_get_thread_num(), - .shared = &state_shared, + .ith = omp_get_thread_num(), + .threadpool = threadpool, }; ggml_graph_compute_thread(&worker); } } else { struct ggml_compute_state worker = { - .thrd = 0, - .ith = 0, - .shared = &state_shared, + .ith = 0, + .threadpool = threadpool, }; ggml_graph_compute_thread(&worker); } #else - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - - for (int j = 0; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; - } - - // create thread pool - for (int j = 1; j < n_threads; ++j) { - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + if (!disposable_threadpool) { + // Update main thread affinity to match the current threadpool + if (threadpool->workers[0].mask_specified) { + __thread_affinity(threadpool->workers[0].cpumask); + } - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); + threadpool->new_work = true; + if (!threadpool->poll) { + ggml_mutex_lock(&threadpool->mutex); + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); } } + // this is a work thread too + ggml_graph_compute_thread(&threadpool->workers[0]); #endif // don't leave affinity set on the main thread clear_numa_thread_affinity(); - return state_shared.ec; + if (disposable_threadpool) { + ggml_release_threadpool(threadpool); + } + + return threadpool->ec; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); @@ -19684,7 +20166,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -20031,7 +20513,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; diff --git a/include/llama.h b/include/llama.h index 6cca6320b347d..b569c58e9ea5c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -428,6 +428,18 @@ extern "C" { //optional: LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); + // Optional: an auto threadpool gets created in ggml if not passed explicitly + LLAMA_API void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + + // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 8d5f24783d6ab..695b5eb00df5e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3091,6 +3091,9 @@ struct llama_context { #endif ggml_backend_t backend_cpu = nullptr; + ggml_compute_threadpool_t threadpool = nullptr; + ggml_compute_threadpool_t threadpool_batch = nullptr; + bool has_evaluated_once = false; int64_t t_start_us; @@ -15494,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) { } static void llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads) { + llama_context & lctx, + ggml_cgraph * gf, + int n_threads, + ggml_compute_threadpool * threadpool) { #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -15505,6 +15509,7 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } #ifdef GGML_USE_BLAS @@ -15518,6 +15523,42 @@ static void llama_graph_compute( // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } +// Optionally swaps the batch and single-token threadpools. +// Returns the number of threads, and if a valid threadpool exists, returns it too. +static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools( + llama_context & lctx, + int32_t n_tokens) { + + const auto & cparams = lctx.cparams; + int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + + ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool + + // A batch threadpool without a non-batch threadpool isn't supported.
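+ // (callers must attach the regular threadpool before, or together with, the batch threadpool)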
+ GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool); + + if (lctx.threadpool_batch && lctx.threadpool) { + // Switch between the 2 threadpools as needed + if (n_tokens > 1) { + ggml_pause_threadpool(lctx.threadpool); + ggml_resume_threadpool(lctx.threadpool_batch); + threadpool = lctx.threadpool_batch; + n_threads = cparams.n_threads_batch; + } else { + ggml_pause_threadpool(lctx.threadpool_batch); + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + n_threads = cparams.n_threads; + } + } else if (lctx.threadpool) { + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + n_threads = cparams.n_threads; + } + return std::make_pair(n_threads, threadpool); +} + + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -15624,7 +15665,12 @@ static int llama_decode_internal( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + std::pair<int32_t, ggml_compute_threadpool_t> threads = + llama_swap_threadpools(lctx, n_tokens); + + int32_t n_threads = threads.first; + ggml_compute_threadpool_t threadpool = threads.second; + GGML_ASSERT(n_threads > 0); // non-causal masks do not use the KV cache @@ -15686,7 +15732,7 @@ static int llama_decode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // update the kv ring buffer { @@ -15863,7 +15909,11 @@ static int llama_encode_internal( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + std::pair<int32_t, ggml_compute_threadpool_t> threads = + llama_swap_threadpools(lctx, n_tokens); + + int32_t n_threads = threads.first; + ggml_compute_threadpool_t threadpool = threads.second; GGML_ASSERT(n_threads > 0); ggml_backend_sched_reset(lctx.sched); @@ -15895,7 +15945,7 @@ static int llama_encode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // extract embeddings if (embd) { @@ -16177,7 +16227,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); #endif //const int64_t t_end = ggml_time_us(); @@ -16203,7 +16253,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -17451,6 +17501,31 @@ void llama_numa_init(enum ggml_numa_strategy numa) { } } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool) { + ctx->threadpool = threadpool; +} + +void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool_batch) { + ctx->threadpool_batch = threadpool_batch; +} + +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; +} + +void llama_detach_batch_threadpool(struct llama_context * ctx) { + ctx->threadpool_batch = nullptr; +} + +void llama_detach_threadpools(struct llama_context * ctx) { + llama_detach_threadpool(ctx); + llama_detach_batch_threadpool(ctx); +} + void llama_backend_free(void) { ggml_quantize_free(); } diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 8159e276af617..246bb227d1e19
100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32( } static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); From a0aae528bbfc2144ee3d58fc81326df7a14181cf Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 02/48] Minor fixes --- examples/CMakeLists.txt | 2 +- examples/speculative/speculative.cpp | 7 ++++--- ggml/src/ggml.c | 4 ++-- include/llama.h | 2 ++ src/llama.cpp | 9 +++++++++ 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 247d52c6d3454..67b3d27747850 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,6 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) - #add_subdirectory(speculative) + add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b051a18f169c2..1616edecbbef6 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -73,10 +73,11 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; + if (params.draft_cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.draft_cpuparams.n_threads; } - params.n_threads_batch = params.n_threads_batch_draft; + + params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; llama_init_result llama_init_dft = llama_init_from_gpt_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7d8d179186306..af62eb922dbbb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18737,7 +18737,7 @@ static bool __thread_affinity(const bool * mask) { for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { if (mask[i]) { - printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); CPU_SET(i, &cpuset); } } @@ -19130,7 +19130,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { if (threadpool->poll) { while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { // No new work. Yield and keep polling.
- //__cpu_relax(); + __cpu_relax(); } } else { ggml_mutex_lock_shared(&threadpool->mutex); diff --git a/include/llama.h b/include/llama.h index b569c58e9ea5c..90b68f812e49f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -439,6 +439,8 @@ extern "C" { LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + // Pauses all attached threadpools + LLAMA_API void llama_pause_threadpools(struct llama_context * ctx); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 695b5eb00df5e..8e0ccaac65c27 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17526,6 +17526,15 @@ void llama_detach_threadpools(struct llama_context * ctx) { llama_detach_batch_threadpool(ctx); } +void llama_pause_threadpools(struct llama_context * ctx) { + if (ctx->threadpool) { + ggml_pause_threadpool(ctx->threadpool); + } + if (ctx->threadpool_batch) { + ggml_pause_threadpool(ctx->threadpool_batch); + } +} + void llama_backend_free(void) { ggml_quantize_free(); } From d5c9c14dea91cb2ce9200201ce98adeee5bc3e6d Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 03/48] fixed use after release bug --- ggml/src/ggml.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index af62eb922dbbb..6df6ffa8395d9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19363,11 +19363,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // don't leave affinity set on the main thread clear_numa_thread_affinity(); + enum ggml_status ret = threadpool->ec; + if (disposable_threadpool) { ggml_release_threadpool(threadpool); } - return threadpool->ec; + return ret; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { From 82224f84d73e54932bff3b3b606610c04df9e8ca Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 04/48] fixed a harmless race condition --- ggml/src/ggml.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6df6ffa8395d9..1185e785cd61e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19300,18 +19300,16 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl }; threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); - } else if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); - } - - // Set up work - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; - - if (!disposable_threadpool) { + } else { + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); + } + // Not a disposable threadpool: // Reset some of the paramters that need resetting // No worker threads should be accessing the parameters below at this stage + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; From 817eaf0c00e3322fc50761d214c029c60851fc62 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 05/48] Fix Android bulid issue --- ggml/src/ggml.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1185e785cd61e..0aed97e3cd6ed 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -121,8 +121,14 @@ static int sched_yield (void) { return 0; } #else + +#ifndef __USE_GNU +#define __USE_GNU +#endif + #include #include +#include typedef void * thread_ret_t; @@ -18724,11 +18730,6 @@ static bool __thread_priority(int32_t prio) { #else // posix? -#ifndef __USE_GNU -#define __USE_GNU -#endif -#include - static bool __thread_affinity(const bool * mask) { cpu_set_t cpuset; int32_t err; From 57637326c411e4576818294a191cbe0214978067 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:31 -0400 Subject: [PATCH 06/48] fix more race conditions --- ggml/src/ggml.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0aed97e3cd6ed..d6aebd37bebdf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18877,7 +18877,10 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP GGML_ASSERT(!threadpool->disposable); GGML_PRINT_DEBUG("Pausing threadpool\n"); + ggml_mutex_lock(&threadpool->mutex); threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); #else UNUSED(threadpool); #endif @@ -19348,9 +19351,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl __thread_affinity(threadpool->workers[0].cpumask); } - threadpool->new_work = true; if (!threadpool->poll) { ggml_mutex_lock(&threadpool->mutex); + threadpool->new_work = true; ggml_cond_broadcast(&threadpool->cond); ggml_mutex_unlock(&threadpool->mutex); } From 3008b31b17ba768485d9d67156964f859b5597a7 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:31 -0400 Subject: [PATCH 07/48] fix deadlock for cases where cgraph.n_nodes == 1 and fix --poll case --- ggml/src/ggml.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d6aebd37bebdf..5375bdd31ccb8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19114,7 +19114,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } + if (cgraph->n_nodes == 1) { + // We need a barrier before disabling new_work in case we have a trivial graph + ggml_barrier(state->threadpool); + } + if (!state->threadpool->disposable && state->ith == 0) { + // Don't need a lock, because there is a barrier after this, and only after that + // do the secondary threads go into standby state->threadpool->new_work = false; } @@ -19356,6 +19363,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->new_work = true; ggml_cond_broadcast(&threadpool->cond); ggml_mutex_unlock(&threadpool->mutex); + } else { + threadpool->new_work = true; } } // this is a work thread too From 96d6603dc70a93dc5bd5bf26812f77f748600e3a Mon Sep 17 
00:00:00 2001 From: Max Krasnyansky Date: Sat, 3 Aug 2024 16:14:04 -0700 Subject: [PATCH 08/48] threadpool: use cpu_get_num_math to set the default number of threadpool threads This way we avoid using E-Cores and Hyperthreaded siblings. --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 6c927fc17890b..796044de97947 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -285,7 +285,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = std::thread::hardware_concurrency(); + cpuparams.n_threads = cpu_get_num_math(); } } From 2953441563754f0c29b3deccb1f4a49695f62dea Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 3 Aug 2024 17:17:39 -0700 Subject: [PATCH 09/48] bench: create fresh threadpool for each test For benchmarking it's better to start a fresh pool for each test with the exact number of threads needed for that test. Having larger pools is suboptimal (causes more load, etc). --- examples/llama-bench/llama-bench.cpp | 42 ++++++++++++---------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 5a929ceddafbe..1009ac57b7be2 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -291,7 +291,6 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -mt, --max-threads (default: %d)\n", cmd_params_defaults.cpuparams.n_threads); printf(" -C, --cpu-mask (default: 0x0)\n"); printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); @@ -499,12 +498,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } - } else if (arg == "-mt" || arg == "--max-threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.cpuparams.n_threads = std::stoi(argv[i]); } else if (arg == "-C" || arg == "--cpu-mask") { if (++i >= argc) { invalid_param = true; @@ -1435,21 +1428,6 @@ int main(int argc, char ** argv) { postprocess_cpu_params(params.cpuparams); - struct ggml_threadpool_params tpp; - tpp.n_threads = params.cpuparams.n_threads; - tpp.mask_specified = params.cpuparams.mask_valid; - tpp.strict_cpu = params.cpuparams.strict_cpu; - tpp.prio = params.cpuparams.priority; - tpp.poll = params.cpuparams.poll; - - std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); - - struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); - if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - for (const auto & inst : params_instances) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1475,6 +1453,22 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); llama_kv_cache_clear(ctx); + + struct ggml_threadpool_params tpp; + tpp.n_threads = t.n_threads; + tpp.mask_specified =
params.cpuparams.mask_valid; + tpp.strict_cpu = params.cpuparams.strict_cpu; + tpp.prio = params.cpuparams.priority; + tpp.poll = params.cpuparams.poll; + + std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + + struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + llama_attach_threadpool(ctx, threadpool); // warmup run @@ -1515,9 +1509,9 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); - } - ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool); + } llama_free_model(lmodel); From 6fcc780b5ffdd3a86b94f807be0ed1a213be2503 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 5 Aug 2024 14:25:49 -0700 Subject: [PATCH 10/48] atomics: always use stdatomics with clang and use relaxed memory order when polling in ggml_barrier This also removes sched_yield() calls from ggml_barrier() to match OpenMP behavior. --- ggml/src/ggml.c | 63 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5375bdd31ccb8..07f8cdf754ac7 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -69,23 +69,38 @@ int ggml_sve_cnt_b = 0; #endif #include <windows.h> +#if !defined(__clang__) typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; typedef atomic_int atomic_flag; #define ATOMIC_FLAG_INIT 0 +typedef enum { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst +} memory_order; + static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } +static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedCompareExchange(ptr, 0, 0); +} static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { - return atomic_fetch_add(ptr, -(dec)); +static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedExchangeAdd(ptr, inc); } static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { return InterlockedExchange(ptr, 1); @@ -93,6 +108,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { static void atomic_flag_clear(atomic_flag * ptr) { InterlockedExchange(ptr, 0); } +#else // clang +#include <stdatomic.h> +#endif typedef HANDLE pthread_t; @@ -3030,6 +3048,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// Helpers for polling loops +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) +static inline void __cpu_relax(void) { + __asm__ volatile("yield" ::: "memory"); +} +#elif defined(__x86_64__) +static inline void __cpu_relax(void) { + _mm_pause(); +} +#else +static inline void __cpu_relax(void) {;} +#endif + // // NUMA support // @@ -3094,25 +3125,19 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { atomic_int *
n_barrier_passed = &threadpool->n_barrier_passed; int n_threads = threadpool->n_threads_cur; - int passed_old = atomic_load(n_barrier_passed); + int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { // last thread atomic_store(n_barrier, 0); - atomic_fetch_add(n_barrier_passed, 1); + atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); } else { // wait for other threads - const int n_spin_before_sleep = 100000; while (true) { - for (int i = 0; i < n_spin_before_sleep; i++) { - if (atomic_load(n_barrier_passed) != passed_old) { - return; - } - #if defined(__SSE3__) - _mm_pause(); - #endif + if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { + return; } - sched_yield(); + __cpu_relax(); } } } @@ -18800,18 +18825,6 @@ static bool __thread_priority(int32_t prio) { #endif -#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) -static inline void __cpu_relax(void) { - __asm__ volatile("yield" ::: "memory"); -} -#elif defined(__x86_64__) -static inline void __cpu_relax(void) { - _mm_pause(); -} -#else -static inline void __cpu_relax(void) {;} -#endif - static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { if (!global_mask) { memset(local_mask, 1, GGML_MAX_N_THREADS); From 3b62f7c1458018834da3abe559dd2974923567b2 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 6 Aug 2024 18:35:53 -0700 Subject: [PATCH 11/48] threadpool: make polling the default to match openmp behavior All command line args now allow for setting poll to 0 (false). --- common/common.cpp | 34 +++++++++++++++++----------- common/common.h | 2 +- examples/llama-bench/llama-bench.cpp | 12 ++++++++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 796044de97947..f0e3fdf175bc1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -518,11 +518,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--cpu-strict") { - params.cpuparams.strict_cpu = true; + CHECK_ARG + params.cpuparams.strict_cpu = std::stoul(argv[i]); return true; } if (arg == "--poll") { - params.cpuparams.poll = true; + CHECK_ARG + params.cpuparams.poll = std::stoul(argv[i]); return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -557,7 +559,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-batch") { - params.cpuparams_batch.poll = true; + CHECK_ARG + params.cpuparams_batch.poll = std::stoul(argv[i]); return true; } if (arg == "-td" || arg == "--threads-draft") { @@ -592,7 +595,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-draft") { - params.draft_cpuparams.poll = true; + CHECK_ARG + params.draft_cpuparams.poll = std::stoul(argv[i]); return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { @@ -620,7 +624,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-batch-draft") { - params.draft_cpuparams_batch.poll = true; + CHECK_ARG + params.draft_cpuparams_batch.poll = std::stoul(argv[i]); return true; } if (arg == "-p" || arg == "--prompt") { @@ -1710,34 +1715,37 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-t, --threads N", "number of threads to use during 
generation (default: %d)", params.cpuparams.n_threads }); options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); + options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch", "use strict CPU placement (default: same as --cpu-strict)"}); + options.push_back({ "*", " --cpu-strict-batch <0|1>", + "use strict CPU placement (default: same as --cpu-strict)"}); options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch", "use polling to wait for work (default: --poll)"}); + options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft", "Use strict CPU placement for draft model (default: same as --cpu-strict)"}); + options.push_back({ "speculative", " --cpu-strict-draft <0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)"}); options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft", "Use polling to wait for draft model work (default: same as --poll])"}); + options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M", "Draft model CPU affinity mask. 
Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft", + options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); options.push_back({ "speculative", " --priority-batch-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft", "Use polling to wait for draft model work (default: --poll-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft <0|1>", + "Use polling to wait for draft model work (default: --poll-draft)"}); options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); diff --git a/common/common.h b/common/common.h index 0f0346065b425..2a1349e6269dd 100644 --- a/common/common.h +++ b/common/common.h @@ -73,7 +73,7 @@ struct cpu_params { bool mask_valid = false; // Default: any CPU int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement - bool poll = false; // Use polling (busywait) to wait for work + bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP) }; struct gpt_params { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 1009ac57b7be2..571ca6dd2eb2b 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -513,9 +513,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } params.cpuparams.priority = std::stoul(argv[i]); } else if (arg == "--cpu-strict") { - params.cpuparams.strict_cpu = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.strict_cpu = std::stoul(argv[i]); } else if (arg == "--poll") { - params.cpuparams.poll = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.poll = std::stoul(argv[i]); } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; From dfa63778bdca7773fe01a31418ff3981166a59a9 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 7 Aug 2024 23:08:31 -0700 Subject: [PATCH 12/48] threadpool: do not wakeup threads in already paused threadpool --- ggml/src/ggml.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 07f8cdf754ac7..989b5d6924532 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18891,8 +18891,10 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { GGML_ASSERT(!threadpool->disposable); GGML_PRINT_DEBUG("Pausing threadpool\n"); ggml_mutex_lock(&threadpool->mutex); - threadpool->pause = true; - ggml_cond_broadcast(&threadpool->cond); + if (!threadpool->pause) { + threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); + } ggml_mutex_unlock(&threadpool->mutex); #else UNUSED(threadpool); @@ -18905,8 +18907,10 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { GGML_PRINT_DEBUG("Resuming threadpool\n"); ggml_mutex_lock(&threadpool->mutex); - 
threadpool->pause = false; - ggml_cond_broadcast(&threadpool->cond); + if (threadpool->pause) { + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); + } ggml_mutex_unlock(&threadpool->mutex); #else UNUSED(threadpool); From 2e18f0d4c933241c8ecfb01e11108b992c3124c9 Mon Sep 17 00:00:00 2001 From: fmz Date: Thu, 8 Aug 2024 05:59:20 -0700 Subject: [PATCH 13/48] fix potential race condition in check_for_work --- examples/llava/minicpmv-cli.cpp | 2 +- ggml/src/ggml.c | 35 +++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 379fc295f1101..f500ea5b944f4 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling, static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; return NULL; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 989b5d6924532..3dbc1244c0ac6 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18861,7 +18861,6 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { if (!threadpool->disposable) { ggml_mutex_lock(&threadpool->mutex); } - threadpool->n_threads_cur = n_threads; threadpool->stop = true; threadpool->pause = false; if (!threadpool->disposable) { @@ -19154,21 +19153,27 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - do { - if (threadpool->poll) { - while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { - // No new work. Yield and keep polling. - __cpu_relax(); - } - } else { - ggml_mutex_lock_shared(&threadpool->mutex); - while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { - // No new work. Wait for the signal. - ggml_cond_wait(&threadpool->cond, &threadpool->mutex); - } - ggml_mutex_unlock_shared(&threadpool->mutex); + if (threadpool->poll) { + while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || + threadpool->stop || + threadpool->pause + ) + ) { + // No new work. Yield and keep polling. + __cpu_relax(); } - } while (state->ith >= threadpool->n_threads_cur); + } else { + ggml_mutex_lock_shared(&threadpool->mutex); + while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || + threadpool->stop || + threadpool->pause + ) + ) { + // No new work. Wait for the signal. 
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } return threadpool->new_work; } From 48aa8eec07cc08464d610621a184e9b2c23fc28e Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 8 Aug 2024 16:26:49 -0700 Subject: [PATCH 14/48] threadpool: do not create two threadpools if their params are identical --- examples/main/main.cpp | 21 ++++++++++++++------- ggml/include/ggml.h | 1 + ggml/src/ggml.c | 13 +++++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1628a42a9bd7b..5e560289dfdab 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -230,24 +230,31 @@ int main(int argc, char ** argv) { struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); - struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); - if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); - } struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); } - llama_attach_batch_threadpool(ctx, threadpool_batch); llama_attach_threadpool(ctx, threadpool); if (ctx_guidance) { - llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); llama_attach_threadpool(ctx_guidance, threadpool); } + struct ggml_compute_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + + llama_attach_batch_threadpool(ctx, threadpool_batch); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + } + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9b29a3af72d81..173b3b22e794e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2024,6 +2024,7 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3dbc1244c0ac6..5a6d313aef7bd 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19222,6 +19222,19 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { #endif // GGML_USE_OPENMP +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + if (p0->mask_specified != p1->mask_specified) return false; + if (p0->mask_specified) { + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; + } + + return true; +} + static 
struct ggml_compute_threadpool * ggml_create_threadpool_impl( struct ggml_threadpool_params * tpp, bool disposable, From 494e27c793c6952f35313d243c8ff356f9554e07 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 10 Aug 2024 16:12:06 -0700 Subject: [PATCH 15/48] threadpool: reduce pause/resume/wakeup overhead in common cases We now start threadpool in paused state only if we have two. The resume is now implicit (ie new work) which allows for reduced locking and context-switch overhead. --- common/common.cpp | 1 + examples/main/main.cpp | 25 ++++++++++++++----------- ggml/include/ggml.h | 1 + ggml/src/ggml.c | 42 +++++++++++++++++++++++++++++++----------- src/llama.cpp | 3 --- 5 files changed, 47 insertions(+), 25 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index f0e3fdf175bc1..08a4bd328c355 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2615,6 +2615,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p tpp.prio = params.priority; tpp.poll = params.poll; tpp.strict_cpu = params.strict_cpu; + tpp.paused = false; return tpp; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5e560289dfdab..42f6eb5d473b1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -230,17 +230,6 @@ int main(int argc, char ** argv) { struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); - struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); - if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - - llama_attach_threadpool(ctx, threadpool); - if (ctx_guidance) { - llama_attach_threadpool(ctx_guidance, threadpool); - } - struct ggml_compute_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_create_threadpool(&tpp_batch); @@ -253,6 +242,20 @@ int main(int argc, char ** argv) { if (ctx_guidance) { llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); } + + // Start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_threadpool(ctx, threadpool); + if (ctx_guidance) { + llama_attach_threadpool(ctx_guidance, threadpool); } const int n_ctx_train = llama_n_ctx_train(model); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 173b3b22e794e..1c5275408789e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -633,6 +633,7 @@ extern "C" { int32_t prio; bool poll; bool strict_cpu; + bool paused; }; struct ggml_compute_threadpool; // forward declaration, see ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5a6d313aef7bd..d10626ac14c6f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18885,14 +18885,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { GGML_ALIGNED_FREE(threadpool); } +#ifndef GGML_USE_OPENMP +// pause/resume must be called under mutex +static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); +} + +static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { + GGML_PRINT_DEBUG("Resuming threadpool\n"); + threadpool->pause = false; + 
ggml_cond_broadcast(&threadpool->cond); +} +#endif + void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP GGML_ASSERT(!threadpool->disposable); - GGML_PRINT_DEBUG("Pausing threadpool\n"); ggml_mutex_lock(&threadpool->mutex); if (!threadpool->pause) { - threadpool->pause = true; - ggml_cond_broadcast(&threadpool->cond); + __ggml_pause_threadpool(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -18903,12 +18916,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP GGML_ASSERT(!threadpool->disposable); - GGML_PRINT_DEBUG("Resuming threadpool\n"); - ggml_mutex_lock(&threadpool->mutex); + ggml_mutex_lock(&threadpool->mutex); if (threadpool->pause) { - threadpool->pause = false; - ggml_cond_broadcast(&threadpool->cond); + __ggml_resume_threadpool(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -19250,7 +19260,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->stop = false; - threadpool->pause = disposable ? false : true; + threadpool->pause = disposable ? false : tpp->paused; threadpool->new_work = false; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; @@ -19340,9 +19350,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl struct ggml_threadpool_params ttp = { .mask_specified = false, .n_threads = n_threads, - .prio = 1, + .prio = 0, .poll = false, - .strict_cpu = false + .strict_cpu = false, + .paused = false }; threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); @@ -19396,10 +19407,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl if (!threadpool->poll) { ggml_mutex_lock(&threadpool->mutex); threadpool->new_work = true; - ggml_cond_broadcast(&threadpool->cond); + if (threadpool->pause) { + __ggml_resume_threadpool(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); + } ggml_mutex_unlock(&threadpool->mutex); } else { threadpool->new_work = true; + if (threadpool->pause) { + ggml_mutex_lock(&threadpool->mutex); + __ggml_resume_threadpool(threadpool); + ggml_mutex_unlock(&threadpool->mutex); + } } } // this is a work thread too diff --git a/src/llama.cpp b/src/llama.cpp index 8e0ccaac65c27..8d3e6aaf43119 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15541,17 +15541,14 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools( // Switch between the 2 threadpools as needed if (n_tokens > 1) { ggml_pause_threadpool(lctx.threadpool); - ggml_resume_threadpool(lctx.threadpool_batch); threadpool = lctx.threadpool_batch; n_threads = cparams.n_threads_batch; } else { ggml_pause_threadpool(lctx.threadpool_batch); - ggml_resume_threadpool(lctx.threadpool); threadpool = lctx.threadpool; n_threads = cparams.n_threads; } } else if (lctx.threadpool) { - ggml_resume_threadpool(lctx.threadpool); threadpool = lctx.threadpool; n_threads = cparams.n_threads; } From b630acdb73d4bfdf93eb206658d0f5f983edc2fe Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sun, 11 Aug 2024 11:20:32 -0700 Subject: [PATCH 16/48] threadpool: add support for hybrid polling poll params (--poll, ...) now specify "polling level", i.e. how aggressively we poll before waiting on cond.var. poll=0 means no polling, 1 means poll for 128K rounds then wait, 2 for 256K rounds, ... The default value of 50 (ie 50x128K rounds) seems like a decent default across modern platforms.
We can tune this further as things evolve. --- common/common.cpp | 2 +- common/common.h | 2 +- ggml/include/ggml.h | 14 ++++---- ggml/src/ggml.c | 86 +++++++++++++++++++++++++-------------------- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 08a4bd328c355..2fc4d6da7a73f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1717,7 +1717,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); + options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", diff --git a/common/common.h b/common/common.h index 2a1349e6269dd..c2b72f6de0f4b 100644 --- a/common/common.h +++ b/common/common.h @@ -73,7 +73,7 @@ struct cpu_params { bool mask_valid = false; // Default: any CPU int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement - bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP) + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; struct gpt_params { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1c5275408789e..1afba914d2df1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -627,13 +627,13 @@ extern "C" { typedef bool (*ggml_abort_callback)(void * data); struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; - bool mask_specified; - int32_t n_threads; - int32_t prio; - bool poll; - bool strict_cpu; - bool paused; + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores + bool mask_specified; // mask is non-empty + int32_t n_threads; // number of threads + int32_t prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state }; struct ggml_compute_threadpool; // forward declaration, see ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d10626ac14c6f..9bea4e0af1a3d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool { int32_t prio; // Scheduling priority bool disposable; // Doesn't initialize a conv-var - bool poll; // Use polling (busywait) // TODO + uint32_t poll; // Polling level (0 - no polling) ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; @@ -19156,35 +19156,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } +#ifndef GGML_USE_OPENMP +static inline bool 
ggml_graph_compute_got_work(struct ggml_compute_state *state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; + return (threadpool->new_work && state->ith < threadpool->n_threads_cur); +} -#ifndef GGML_USE_OPENMP +static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; + if (threadpool->stop || threadpool->pause) return true; + return ggml_graph_compute_got_work(state); +} + +static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; + + // This seems to make 0 ... 100 a decent range for polling level across modern processors. + // Perhaps, we can adjust it dynamically based on load and things. + const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; + + for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) { + // No new work. Yield and keep polling. + __cpu_relax(); + } + + return ggml_graph_compute_got_work(state); +} + static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - if (threadpool->poll) { - while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || - threadpool->stop || - threadpool->pause - ) - ) { - // No new work. Yield and keep polling. - __cpu_relax(); - } - } else { - ggml_mutex_lock_shared(&threadpool->mutex); - while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || - threadpool->stop || - threadpool->pause - ) - ) { - // No new work. Wait for the signal. - ggml_cond_wait(&threadpool->cond, &threadpool->mutex); - } - ggml_mutex_unlock_shared(&threadpool->mutex); + if (ggml_graph_compute_poll_for_work(state)) { + return ggml_graph_compute_got_work(state); + } + + ggml_mutex_lock_shared(&threadpool->mutex); + while (!ggml_graph_compute_ready(state)) { + // No new work. Wait for the signal. + GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith); + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); } - return threadpool->new_work; + ggml_mutex_unlock_shared(&threadpool->mutex); + + return ggml_graph_compute_got_work(state); } static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { @@ -19404,24 +19419,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl __thread_affinity(threadpool->workers[0].cpumask); } - if (!threadpool->poll) { - ggml_mutex_lock(&threadpool->mutex); - threadpool->new_work = true; - if (threadpool->pause) { - __ggml_resume_threadpool(threadpool); - } else { - ggml_cond_broadcast(&threadpool->cond); - } - ggml_mutex_unlock(&threadpool->mutex); + // always take the mutex here because the worker threads are doing hybrid poll/wait + + ggml_mutex_lock(&threadpool->mutex); + threadpool->new_work = true; + if (!threadpool->pause) { + ggml_cond_broadcast(&threadpool->cond); } else { - threadpool->new_work = true; - if (threadpool->pause) { - ggml_mutex_lock(&threadpool->mutex); - __ggml_resume_threadpool(threadpool); - ggml_mutex_unlock(&threadpool->mutex); - } + // resume does cond broadcast + __ggml_resume_threadpool(threadpool); } + ggml_mutex_unlock(&threadpool->mutex); } + // this is a work thread too ggml_graph_compute_thread(&threadpool->workers[0]); #endif From 9d3e78c6b83ad349edce6820bc1678486a38f80e Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 12 Aug 2024 19:04:01 -0700 Subject: [PATCH 17/48] threadpool: reduce the number of barriers required New work is now indicated with an atomic counter that is incremented for each new graph that needs to be computed. This removes the need for an extra barrier for clearing the "new_work" flag and removes the special case for trivial graphs.
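The new-work detection can be sketched like this (a simplified paraphrase of ggml_graph_compute_ready from the diff below, assuming the ggml_compute_state fields added by this patch; not the verbatim code):

    // Each worker remembers the last graph generation it observed. The main
    // thread bumps threadpool->n_graph once per submitted graph; a worker has
    // pending work iff the counter moved and its index is within n_threads_cur.
    static bool worker_sees_new_graph(struct ggml_compute_state * state) {
        int n_graph = atomic_load_explicit(&state->threadpool->n_graph, memory_order_relaxed);
        if (n_graph != state->last_graph) {
            state->pending    = (state->ith < state->threadpool->n_threads_cur);
            state->last_graph = n_graph;
        }
        return state->pending;
    }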
--- ggml/src/ggml.c | 78 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9bea4e0af1a3d..39b9b27dcbc4d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1959,13 +1959,13 @@ struct ggml_compute_threadpool { struct ggml_cplan * cplan; // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int n_barrier; atomic_int n_barrier_passed; atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. volatile bool stop; // Used for stopping the threadpool altogether volatile bool pause; // Used for pausing the threadpool or individual threads - volatile bool new_work; // Set when there is work to be done, unset after it's done struct ggml_compute_state * workers; // per thread state int32_t n_threads_max; // number of threads in the pool @@ -1987,6 +1987,8 @@ struct ggml_compute_state { ggml_thread_t thrd; bool cpumask[GGML_MAX_N_THREADS]; bool mask_specified; + int last_graph; + bool pending; #endif struct ggml_compute_threadpool * threadpool; int ith; @@ -19118,55 +19120,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ state->threadpool, }; - struct ggml_tensor * node = cgraph->nodes[0]; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - ggml_compute_forward(&params, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->threadpool->ec = GGML_STATUS_ABORTED; - } - - for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) { - ggml_barrier(state->threadpool); - - if (state->threadpool->ec != GGML_STATUS_SUCCESS) { - break; - } - - node = cgraph->nodes[node_n]; ggml_compute_forward(&params, node); if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { state->threadpool->ec = GGML_STATUS_ABORTED; } - } - if (cgraph->n_nodes == 1) { - // We need a barrier before disabling new_work in case we have a trivial graph ggml_barrier(state->threadpool); - } - if (!state->threadpool->disposable && state->ith == 0) { - // Don't need a lock, because there is a barrier after this, and only after that - // do the secondary threads go into standby - state->threadpool->new_work = false; + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { + break; + } } - ggml_barrier(state->threadpool); - return 0; } #ifndef GGML_USE_OPENMP -static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) { - struct ggml_compute_threadpool * threadpool = state->threadpool; - return (threadpool->new_work && state->ith < threadpool->n_threads_cur); -} - static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - if (threadpool->stop || threadpool->pause) return true; - return ggml_graph_compute_got_work(state); + if (threadpool->stop || threadpool->pause || state->pending) { return true; } + + // check for new graph/work + int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); + if (new_graph != state->last_graph) { + state->pending = (state->ith < threadpool->n_threads_cur); + state->last_graph = new_graph; + } + + return state->pending; } static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { @@ -19181,14 +19167,14 @@ static inline bool
ggml_graph_compute_poll_for_work(struct ggml_compute_state * __cpu_relax(); } - return ggml_graph_compute_got_work(state); + return state->pending; } -static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { +static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; if (ggml_graph_compute_poll_for_work(state)) { - return ggml_graph_compute_got_work(state); + return state->pending; } ggml_mutex_lock_shared(&threadpool->mutex); @@ -19199,7 +19185,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) } ggml_mutex_unlock_shared(&threadpool->mutex); - return ggml_graph_compute_got_work(state); + return state->pending; } static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { @@ -19229,8 +19215,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { // Check if there is new work // The main thread is the only one that can dispatch new work - bool new_work = ggml_graph_compute_check_for_work(state); - if (new_work) { + ggml_graph_compute_check_for_work(state); + if (state->pending) { + state->pending = false; + int64_t ret = (int64_t) ggml_graph_compute_thread(state); if (ret == GGML_EXIT_ABORTED) return (thread_ret_t) ret; @@ -19271,12 +19259,12 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( { threadpool->cgraph = cgraph; threadpool->cplan = cplan; + threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = disposable ? false : tpp->paused; - threadpool->new_work = false; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; @@ -19319,7 +19307,9 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( .thrd = 0, .mask_specified = tpp->mask_specified, .threadpool = threadpool, - .ith = j + .ith = j, + .last_graph = 0, + .pending = false }; if (tpp->mask_specified) { @@ -19422,12 +19412,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // always take the mutex here because the worker threads are doing hybrid poll/wait ggml_mutex_lock(&threadpool->mutex); - threadpool->new_work = true; - if (!threadpool->pause) { - ggml_cond_broadcast(&threadpool->cond); - } else { + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + if (threadpool->pause) { // resume does cond broadcast __ggml_resume_threadpool(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); } ggml_mutex_unlock(&threadpool->mutex); } From 538bd9f730f5996e1cca496fbd348a9b9b946ebf Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 12 Aug 2024 22:18:16 -0700 Subject: [PATCH 18/48] threadpool: remove special-casing for disposable threadpools With the efficient hybrid polling there is no need to make disposable pools any different. This simplifies the overall logic and reduces branching. Include n_threads in debug print for disposable threadpool. Declare pause and stop flags as atomic_bool This doesn't actually generate any memory barriers and simply informs the thread sanitizer that these flags can be written & read by different threads without locking. 
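The flag pattern boils down to something like this (a hedged sketch; pool_flags and pool_should_exit are made-up names, and the relaxed load merely spells out the no-barrier intent, with ordering still provided by the mutex/cond-var around the actual transitions):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct pool_flags {
        atomic_bool stop;   // tear down the whole pool
        atomic_bool pause;  // park the worker threads
    };

    // Reading the flags without holding the mutex is fine; atomic_bool
    // documents the intentional cross-thread access for the sanitizer.
    static bool pool_should_exit(struct pool_flags * f) {
        return atomic_load_explicit(&f->stop, memory_order_relaxed);
    }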
--- ggml/src/ggml.c | 122 +++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 39b9b27dcbc4d..4d4dbdfac74b5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1964,16 +1964,16 @@ struct ggml_compute_threadpool { atomic_int n_barrier_passed; atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. - volatile bool stop; // Used for stopping the threadpool altogether - volatile bool pause; // Used for pausing the threadpool or individual threads + // these are atomic as an annotation for thread-sanitizer + atomic_bool stop; // Used for stopping the threadpool altogether + atomic_bool pause; // Used for pausing the threadpool or individual threads struct ggml_compute_state * workers; // per thread state int32_t n_threads_max; // number of threads in the pool int32_t n_threads_cur; // number of threads used in the current graph - int32_t prio; // Scheduling priority - bool disposable; // Doesn't initialize a conv-var - uint32_t poll; // Polling level (0 - no polling) + int32_t prio; // Scheduling priority + uint32_t poll; // Polling level (0 - no polling) ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; @@ -18860,15 +18860,13 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { struct ggml_compute_state* workers = threadpool->workers; const int32_t n_threads = threadpool->n_threads_max; - if (!threadpool->disposable) { - ggml_mutex_lock(&threadpool->mutex); - } + ggml_mutex_lock(&threadpool->mutex); + threadpool->stop = true; threadpool->pause = false; - if (!threadpool->disposable) { - ggml_cond_broadcast(&threadpool->cond); - ggml_mutex_unlock(&threadpool->mutex); - } + + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); for (int32_t j = 1; j < n_threads; j++) { int32_t rc = ggml_thread_join(workers[j].thrd, NULL); @@ -18878,10 +18876,8 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { GGML_ALIGNED_FREE(workers); - if (!threadpool->disposable) { - ggml_mutex_destroy(&threadpool->mutex); - ggml_cond_destroy(&threadpool->cond); - } + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP GGML_ALIGNED_FREE(threadpool); @@ -18904,7 +18900,6 @@ static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP - GGML_ASSERT(!threadpool->disposable); ggml_mutex_lock(&threadpool->mutex); if (!threadpool->pause) { __ggml_pause_threadpool(threadpool); @@ -18917,7 +18912,6 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP - GGML_ASSERT(!threadpool->disposable); ggml_mutex_lock(&threadpool->mutex); if (threadpool->pause) { __ggml_resume_threadpool(threadpool); @@ -18934,7 +18928,7 @@ struct ggml_cplan ggml_graph_plan( struct ggml_compute_threadpool * threadpool) { if (threadpool == NULL) { - GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool\n"); + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads); } if (n_threads <= 0) { n_threads = threadpool ? 
threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; @@ -19143,7 +19137,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - if (threadpool->stop || threadpool->pause || state->pending) { return true; } + + if (state->pending || threadpool->stop || threadpool->pause) { return true; } // check for new graph/work int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); @@ -19192,8 +19187,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; - GGML_ASSERT(!threadpool->disposable); - __thread_priority(threadpool->prio); if (state->mask_specified) __thread_affinity(state->cpumask); @@ -19209,6 +19202,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); ggml_mutex_unlock_shared(&threadpool->mutex); } + // This needs to be checked for after the cond_wait if (threadpool->stop) break; @@ -19233,6 +19227,25 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { return (thread_ret_t) 0; } +// Start processing new graph +static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpool) +{ + // always take the mutex here because the worker threads are doing hybrid poll/wait + + ggml_mutex_lock(&threadpool->mutex); + + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + + if (threadpool->pause) { + // resume does cond broadcast + __ggml_resume_threadpool(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); + } + + ggml_mutex_unlock(&threadpool->mutex); +} + #endif // GGML_USE_OPENMP bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { @@ -19250,7 +19263,6 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons static struct ggml_compute_threadpool * ggml_create_threadpool_impl( struct ggml_threadpool_params * tpp, - bool disposable, struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19264,11 +19276,10 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->stop = false; - threadpool->pause = disposable ? false : tpp->paused; + threadpool->pause = tpp->paused; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; - threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; - threadpool->disposable = disposable; + threadpool->n_threads_cur = tpp->n_threads; threadpool->poll = tpp->poll; threadpool->prio = tpp->prio; @@ -19278,10 +19289,8 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( } #ifndef GGML_USE_OPENMP - if (!disposable) { - ggml_mutex_init(&threadpool->mutex); - ggml_cond_init(&threadpool->cond); - } + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); #endif // GGML_USE_OPENMP struct ggml_compute_state * workers = @@ -19316,14 +19325,12 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); } - // Disposable threadpools need to have a valid cplan and cgraph immediately. - thread_ret_t (*thread_entrypoint)(void*) = disposable ? 
ggml_graph_compute_thread : ggml_graph_compute_secondary_thread; // Spin threads for all secondary workers if (j > 0) { int32_t rc = ggml_thread_create( &workers[j].thrd, NULL, - thread_entrypoint, + ggml_graph_compute_secondary_thread, &workers[j] ); GGML_ASSERT(rc == 0); @@ -19335,7 +19342,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( } struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { - return ggml_create_threadpool_impl(tpp, false, NULL, NULL); + return ggml_create_threadpool_impl(tpp, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19349,35 +19356,35 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl bool disposable_threadpool = false; if (threadpool == NULL) { - GGML_PRINT_DEBUG("NOTE: No threadpool was specified in this cplan. Will create a disposable threadpool\n"); + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads); disposable_threadpool = true; struct ggml_threadpool_params ttp = { .mask_specified = false, .n_threads = n_threads, .prio = 0, - .poll = false, + .poll = 1, .strict_cpu = false, .paused = false }; - threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); + threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan); } else { - if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); - } - // Not a disposable threadpool: - // Reset some of the paramters that need resetting + // Reset some of the parameters that need resetting // No worker threads should be accessing the parameters below at this stage - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->ec = GGML_STATUS_SUCCESS; } + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); + } + #ifdef GGML_USE_OPENMP if (n_threads > 1) { #pragma omp parallel num_threads(n_threads) @@ -19403,26 +19410,15 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_graph_compute_thread(&worker); } #else - if (!disposable_threadpool) { - // Update main thread affinity to match the current threadpool - if (threadpool->workers[0].mask_specified) { - __thread_affinity(threadpool->workers[0].cpumask); - } - - // always take the mutex here because the worker threads are doing hybrid poll/wait - - ggml_mutex_lock(&threadpool->mutex); - atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); - if (threadpool->pause) { - // resume does cond broadcast - __ggml_resume_threadpool(threadpool); - } else { - ggml_cond_broadcast(&threadpool->cond); - } - ggml_mutex_unlock(&threadpool->mutex); + // Update main thread affinity to match the current threadpool + if (threadpool->workers[0].mask_specified) { + __thread_affinity(threadpool->workers[0].cpumask); } - // this is a work thread too + // Kick all threads to start the new graph + ggml_graph_compute_kickoff(threadpool); + + // This is a work thread too ggml_graph_compute_thread(&threadpool->workers[0]); #endif From db45b6d3a95a97436af9dbd639afbe90d06791db Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 15 Aug 2024 16:20:42 -0700 Subject: [PATCH 19/48] threadpool: do not clear barrier counters between graph computes (fixes race with small graphs) This fixes the race condition with very small graphs where the main thread happens to start a new graph while the workers are just about to exit from barriers. --- ggml/src/ggml.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4d4dbdfac74b5..b1400923ce4f2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19375,8 +19375,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->cgraph = cgraph; threadpool->cplan = cplan; threadpool->n_threads_cur = n_threads; - threadpool->n_barrier = 0; - threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->ec = GGML_STATUS_SUCCESS; } From 307fece5d76a204ef668613f4e19f8e321f71149 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 20 Aug 2024 18:43:39 -0700 Subject: [PATCH 20/48] threadpool: use relaxed order for chunk sync A full memory barrier is overkill for this since each thread works on a different chunk --- ggml/src/ggml.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b1400923ce4f2..15448a6334b9b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -88,6 +88,10 @@ typedef enum { static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } +static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) { + // TODO: add support for explicit memory order + InterlockedExchange(ptr, val); +} static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } @@ -12472,7 +12476,7 @@ UseGgmlGemm1:; if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
- atomic_store(&params->threadpool->current_chunk, nth); + atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed); } ggml_barrier(params->threadpool); @@ -12583,7 +12587,7 @@ UseGgmlGemm2:; break; } - current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1); + current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed); } } From 63a0dad83c0d7c781d30ea537624282be2c78ea5 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 10:09:51 -0700 Subject: [PATCH 21/48] threadpool: remove abort_callback from threadpool state --- ggml/src/ggml.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 15448a6334b9b..6bfc9c36f490b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1979,9 +1979,6 @@ struct ggml_compute_threadpool { int32_t prio; // Scheduling priority uint32_t poll; // Polling level (0 - no polling) - ggml_abort_callback abort_callback; // abort ggml_graph_compute when true - void * abort_callback_data; - enum ggml_status ec; }; @@ -19286,10 +19283,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( threadpool->n_threads_cur = tpp->n_threads; threadpool->poll = tpp->poll; threadpool->prio = tpp->prio; - - threadpool->abort_callback = NULL; - threadpool->abort_callback_data = NULL; - threadpool->ec = GGML_STATUS_SUCCESS; + threadpool->ec = GGML_STATUS_SUCCESS; } #ifndef GGML_USE_OPENMP From 2358bb364b28170b001ff3f8a24ddafe0e79b9c5 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 10:27:53 -0700 Subject: [PATCH 22/48] threadpool: better naming for thread/cpumask related functions --- ggml/src/ggml.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6bfc9c36f490b..bf9c6b20c2fb5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3053,15 +3053,15 @@ static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size // Helpers for polling loops #if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) -static inline void __cpu_relax(void) { +static inline void ggml_thread_cpu_relax(void) { __asm__ volatile("yield" ::: "memory"); } #elif defined(__x86_64__) -static inline void __cpu_relax(void) { +static inline void ggml_thread_cpu_relax(void) { _mm_pause(); } #else -static inline void __cpu_relax(void) {;} +static inline void ggml_thread_cpu_relax(void) {;} #endif // @@ -3140,7 +3140,7 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { return; } - __cpu_relax(); + ggml_thread_cpu_relax(); } } } @@ -18667,7 +18667,7 @@ enum { #include "windows.h" // TODO: support > 64 CPUs -static bool __thread_affinity(bool * mask) { +static bool ggml_thread_apply_affinity(bool * mask) { HANDLE h = GetCurrentThread(); uint64_t bitmask = 0ULL; @@ -18701,7 +18701,7 @@ static bool __thread_affinity(bool * mask) { return m != 0; } -static bool __process_priority(int32_t prio) { +static bool ggml_thread_apply_process_priority(int32_t prio) { DWORD p = NORMAL_PRIORITY_CLASS; switch (prio) { @@ -18714,7 +18714,7 @@ static bool __process_priority(int32_t prio) { return SetPriorityClass(GetCurrentProcess(), p); } -static bool __thread_priority(int32_t prio) { +static bool ggml_thread_apply_thread_priority(int32_t prio) { DWORD p = NORMAL_PRIORITY_CLASS; switch (prio) { @@ -18732,12 +18732,12 @@
static bool __thread_priority(int32_t prio) { #include <sys/types.h> #include <sys/resource.h> -static bool __thread_affinity(const bool * mask) { UNUSED(mask); return true; } -static bool __process_priority(int32_t prio) { +static bool ggml_thread_apply_affinity(const bool * mask) { UNUSED(mask); return true; } +static bool ggml_thread_apply_process_prio(int32_t prio) { int32_t p = 0; switch (prio) { @@ -18751,14 +18751,14 @@ static bool __process_priority(int32_t prio) { return r != -1; } -static bool __thread_priority(int32_t prio) { +static bool ggml_thread_apply_thread_priority(int32_t prio) { UNUSED(prio); return true; } #else // posix? -static bool __thread_affinity(const bool * mask) { +static bool ggml_thread_apply_affinity(const bool * mask) { cpu_set_t cpuset; int32_t err; @@ -18787,7 +18787,7 @@ static bool __thread_affinity(const bool * mask) { return true; } -static bool __process_priority(int32_t prio) { +static bool ggml_thread_apply_process_prio(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; @@ -18807,7 +18807,7 @@ static bool __process_priority(int32_t prio) { return true; } -static bool __thread_priority(int32_t prio) { +static bool ggml_thread_apply_thread_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { @@ -18828,7 +18828,7 @@ static bool __thread_priority(int32_t prio) { #endif -static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { +static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { if (!global_mask) { memset(local_mask, 1, GGML_MAX_N_THREADS); return; @@ -19160,7 +19160,7 @@ static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) { // No new work. Yield and keep polling. - __cpu_relax(); + ggml_thread_cpu_relax(); } return state->pending; @@ -19188,9 +19188,9 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; - __thread_priority(threadpool->prio); + ggml_thread_apply_thread_priority(threadpool->prio); if (state->mask_specified) - __thread_affinity(state->cpumask); + ggml_thread_apply_affinity(state->cpumask); while (true) { // Check if we need to sleep @@ -19306,8 +19306,8 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( #else // Not using OPENMP int32_t cpumask_iter = 0; - __process_priority(tpp->prio); - __thread_priority(tpp->prio); + ggml_thread_apply_process_prio(tpp->prio); + ggml_thread_apply_thread_priority(tpp->prio); for (int j = 0; j < tpp->n_threads; j++) { workers[j] = (struct ggml_compute_state) { @@ -19320,7 +19320,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( }; if (tpp->mask_specified) { - __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); } // Spin threads for all secondary workers @@ -19408,7 +19408,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl #else // Update main thread affinity to match the current threadpool if (threadpool->workers[0].mask_specified) { - __thread_affinity(threadpool->workers[0].cpumask); + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } // Kick all threads to start the new graph From 4a4d71501b61f96c78c8319013dd35011082239a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 10:50:06 -0700 Subject: [PATCH 23/48] threadpool: consistent use
of int type for n_threads params --- common/common.h | 4 ++-- examples/benchmark/benchmark-matmult.cpp | 2 +- examples/main/main.cpp | 2 +- ggml/include/ggml.h | 4 ++-- ggml/src/ggml.c | 16 ++++++++-------- include/llama.h | 10 +++++----- src/llama.cpp | 16 ++++++++-------- 7 files changed, 27 insertions(+), 27 deletions(-) diff --git a/common/common.h b/common/common.h index c2b72f6de0f4b..a665716bee133 100644 --- a/common/common.h +++ b/common/common.h @@ -68,7 +68,7 @@ enum dimre_method { }; struct cpu_params { - int32_t n_threads = -1; + int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. bool mask_valid = false; // Default: any CPU int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) @@ -214,7 +214,7 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; std::string public_path = ""; diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e78f6b388ef6e..97622f4f4fd18 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) struct benchmark_params_struct { - int32_t n_threads = 1; + int n_threads = 1; int32_t n_iterations = 10; }; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 42f6eb5d473b1..a64c1bc258f59 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -223,7 +223,7 @@ int main(int argc, char ** argv) { LOG("%s: llama threadpool init = n_threads = %d\n", __func__, - (int32_t) params.cpuparams.n_threads + (int) params.cpuparams.n_threads ); struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1afba914d2df1..1f9e6756ef518 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -629,7 +629,7 @@ extern "C" { struct ggml_threadpool_params { bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores bool mask_specified; // mask is non-empty - int32_t n_threads; // number of threads + int n_threads; // number of threads int32_t prio; // thread priority uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) bool strict_cpu; // strict cpu placement @@ -2028,7 +2028,7 @@ extern "C" { GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); - GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool); GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bf9c6b20c2fb5..2c8f5a7e35d94 
100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1973,8 +1973,8 @@ struct ggml_compute_threadpool { atomic_bool pause; // Used for pausing the threadpool or individual threads struct ggml_compute_state * workers; // per thread state - int32_t n_threads_max; // number of threads in the pool - int32_t n_threads_cur; // number of threads used in the current graph + int n_threads_max; // number of threads in the pool + int n_threads_cur; // number of threads used in the current graph int32_t prio; // Scheduling priority uint32_t poll; // Polling level (0 - no polling) @@ -18859,7 +18859,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { #ifndef GGML_USE_OPENMP struct ggml_compute_state* workers = threadpool->workers; - const int32_t n_threads = threadpool->n_threads_max; + const int n_threads = threadpool->n_threads_max; ggml_mutex_lock(&threadpool->mutex); @@ -18869,7 +18869,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { ggml_cond_broadcast(&threadpool->cond); ggml_mutex_unlock(&threadpool->mutex); - for (int32_t j = 1; j < n_threads; j++) { + for (int j = 1; j < n_threads; j++) { int32_t rc = ggml_thread_join(workers[j].thrd, NULL); GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); UNUSED(rc); @@ -18925,11 +18925,11 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { struct ggml_cplan ggml_graph_plan( const struct ggml_cgraph * cgraph, - int32_t n_threads, + int n_threads, struct ggml_compute_threadpool * threadpool) { if (threadpool == NULL) { - GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads); + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); } if (n_threads <= 0) { n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; @@ -19348,13 +19348,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - int32_t n_threads = cplan->n_threads; + int n_threads = cplan->n_threads; struct ggml_compute_threadpool * threadpool = cplan->threadpool; bool disposable_threadpool = false; if (threadpool == NULL) { - GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads); + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); disposable_threadpool = true; struct ggml_threadpool_params ttp = { diff --git a/include/llama.h b/include/llama.h index 90b68f812e49f..7b103261d3982 100644 --- a/include/llama.h +++ b/include/llama.h @@ -304,8 +304,8 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + int n_threads; // number of threads to use for generation + int n_threads_batch; // number of threads to use for batch processing enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id @@ -851,13 +851,13 @@ extern "C" { // Set the number of threads used for decoding // n_threads is the number of threads used for generation (single token) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) - LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); + LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch); // Get the number of threads used for generation of a single token. - LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); + LLAMA_API int llama_n_threads(struct llama_context * ctx); // Get the number of threads used for prompt and batch processing (multiple token). - LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); + LLAMA_API int llama_n_threads_batch(struct llama_context * ctx); // Set whether the model is in embeddings mode or not // If true, embeddings will be returned but logits will not diff --git a/src/llama.cpp b/src/llama.cpp index 8d3e6aaf43119..916d0f8c13484 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2373,8 +2373,8 @@ struct llama_cparams { uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + int n_threads; // number of threads to use for generation + int n_threads_batch; // number of threads to use for batch processing float rope_freq_base; float rope_freq_scale; @@ -15530,7 +15530,7 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools( int32_t n_tokens) { const auto & cparams = lctx.cparams; - int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + int n_threads = n_tokens == 1 ?
cparams.n_threads : cparams.n_threads_batch; ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool @@ -15665,7 +15665,7 @@ static int llama_decode_internal( std::pair<int32_t, ggml_compute_threadpool_t> threads = llama_swap_threadpools(lctx, n_tokens); - int32_t n_threads = threads.first; + int n_threads = threads.first; ggml_compute_threadpool_t threadpool = threads.second; GGML_ASSERT(n_threads > 0); @@ -15909,7 +15909,7 @@ static int llama_encode_internal( std::pair<int32_t, ggml_compute_threadpool_t> threads = llama_swap_threadpools(lctx, n_tokens); - int32_t n_threads = threads.first; + int n_threads = threads.first; ggml_compute_threadpool_t threadpool = threads.second; GGML_ASSERT(n_threads > 0); @@ -19448,16 +19448,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } -void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { +void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch) { ctx->cparams.n_threads = n_threads; ctx->cparams.n_threads_batch = n_threads_batch; } -uint32_t llama_n_threads(struct llama_context * ctx) { +int llama_n_threads(struct llama_context * ctx) { return ctx->cparams.n_threads; } -uint32_t llama_n_threads_batch(struct llama_context * ctx) { +int llama_n_threads_batch(struct llama_context * ctx) { return ctx->cparams.n_threads_batch; } From c4452edfead888a0cb731d1002ec74facb9cbc71 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 12:12:48 -0700 Subject: [PATCH 24/48] threadpool: add support for ggml_threadpool_params_default/init Also removes the need for explicit mask_specified param. all-zero cpumask means use default (usually inherited) cpu affinity mask. --- common/common.cpp | 13 ++----- examples/llama-bench/llama-bench.cpp | 9 ++--- ggml/include/ggml.h | 7 +++- ggml/src/ggml.c | 55 +++++++++++++++------------- 4 files changed, 41 insertions(+), 43 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2fc4d6da7a73f..56e86a07ad394 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,13 +295,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } - if (n_set == 0) { - // You hit the jackpot! - memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS); - n_set = GGML_MAX_N_THREADS; - } - - if (n_set && n_set < cpuparams.n_threads) { + if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues.
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } @@ -2606,16 +2600,15 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { struct ggml_threadpool_params tpp; - tpp.mask_specified = params.mask_valid; + ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults + if (params.mask_valid) { std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS); } - tpp.n_threads = params.n_threads; tpp.prio = params.priority; tpp.poll = params.poll; tpp.strict_cpu = params.strict_cpu; - tpp.paused = false; return tpp; } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 571ca6dd2eb2b..aca5f83c30147 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1462,14 +1462,13 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - struct ggml_threadpool_params tpp; - tpp.n_threads = t.n_threads; - tpp.mask_specified = params.cpuparams.mask_valid; + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); tpp.strict_cpu = params.cpuparams.strict_cpu; tpp.prio = params.cpuparams.priority; tpp.poll = params.cpuparams.poll; - - std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + if (params.cpuparams.mask_valid) { + std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + } struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1f9e6756ef518..0accc390850b9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -626,9 +626,10 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + // Threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores - bool mask_specified; // mask is non-empty + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) int n_threads; // number of threads int32_t prio; // thread priority uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) @@ -2025,6 +2026,8 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init(struct ggml_threadpool_params *p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2c8f5a7e35d94..0e46bcea9b94b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1987,7 +1987,6 @@ struct ggml_compute_state { #ifndef GGML_USE_OPENMP ggml_thread_t thrd; bool cpumask[GGML_MAX_N_THREADS]; - bool mask_specified; int last_graph; bool pending; #endif @@ -18828,11 +18827,14 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) { #endif -static void ggml_thread_cpumask_next(const
bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { - if (!global_mask) { - memset(local_mask, 1, GGML_MAX_N_THREADS); - return; +static bool ggml_thread_cpumask_is_valid(const bool * mask) { + for (int i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { return true; } } + return false; +} + +static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { if (!strict) { memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); return; @@ -19189,8 +19191,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_threadpool * threadpool = state->threadpool; ggml_thread_apply_thread_priority(threadpool->prio); - if (state->mask_specified) + + if (ggml_thread_cpumask_is_valid(state->cpumask)) { ggml_thread_apply_affinity(state->cpumask); + } while (true) { // Check if we need to sleep @@ -19249,17 +19253,27 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo #endif // GGML_USE_OPENMP +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { if (p0->n_threads != p1->n_threads ) return false; if (p0->prio != p1->prio ) return false; if (p0->poll != p1->poll ) return false; if (p0->strict_cpu != p1->strict_cpu ) return false; - if (p0->mask_specified != p1->mask_specified) return false; - if (p0->mask_specified) { - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; - } - - return true; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } static struct ggml_compute_threadpool * ggml_create_threadpool_impl( @@ -19312,16 +19326,13 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( for (int j = 0; j < tpp->n_threads; j++) { workers[j] = (struct ggml_compute_state) { .thrd = 0, - .mask_specified = tpp->mask_specified, .threadpool = threadpool, .ith = j, .last_graph = 0, .pending = false }; - if (tpp->mask_specified) { - ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); - } + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); // Spin threads for all secondary workers if (j > 0) { @@ -19357,15 +19368,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_PRINT_DEBUG("Threadpool is not specified. 
Will create a disposable threadpool : n_threads %d\n", n_threads); disposable_threadpool = true; - struct ggml_threadpool_params ttp = { - .mask_specified = false, - .n_threads = n_threads, - .prio = 0, - .poll = 1, - .strict_cpu = false, - .paused = false - }; - + struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan); } else { // Reset some of the parameters that need resetting @@ -19407,7 +19410,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } #else // Update main thread affinity to match the current threadpool - if (threadpool->workers[0].mask_specified) { + if (!ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } From 31541d74272d6e49bfff24544605c3de8466684d Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 13:55:58 -0700 Subject: [PATCH 25/48] threadpool: move typedef into ggml.h --- ggml/include/ggml-alloc.h | 1 - ggml/include/ggml.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index cd85b6ee70560..0dff47d65cf86 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -9,7 +9,6 @@ extern "C" { typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend * ggml_backend_t; -typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0accc390850b9..1df73d3283bec 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -639,6 +639,8 @@ extern "C" { struct ggml_compute_threadpool; // forward declaration, see ggml.c + typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { From 40648601f15ab4f2d84e9a20a35ef6bb432ecb58 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 14:15:22 -0700 Subject: [PATCH 26/48] threadpool: fix apply_priority() function name --- ggml/src/ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0e46bcea9b94b..839aa3f5796fc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18736,7 +18736,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_process_prio(int32_t prio) { +static bool ggml_thread_apply_process_priority(int32_t prio) { int32_t p = 0; switch (prio) { @@ -18786,7 +18786,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_process_prio(int32_t prio) { +static bool ggml_thread_apply_process_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; @@ -19320,7 +19320,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( #else // Not using OPENMP int32_t cpumask_iter = 0; - ggml_thread_apply_process_prio(tpp->prio); + ggml_thread_apply_process_priority(tpp->prio); ggml_thread_apply_thread_priority(tpp->prio); for (int j = 0; j < tpp->n_threads; j++) { From f64c975168382db2ce499edd94ea4c171c64bca6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 15:07:54 -0700 Subject: [PATCH 27/48] threadpool: fix swift wrapper errors due to 
n_threads int type cleanup --- examples/llama.swiftui/llama.cpp.swift/LibLlama.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 58c32ca533bb1..48b7840ae49c3 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -71,8 +71,8 @@ actor LlamaContext { var ctx_params = llama_context_default_params() ctx_params.seed = 1234 ctx_params.n_ctx = 2048 - ctx_params.n_threads = UInt32(n_threads) - ctx_params.n_threads_batch = UInt32(n_threads) + ctx_params.n_threads = Int32(n_threads) + ctx_params.n_threads_batch = Int32(n_threads) let context = llama_new_context_with_model(model, ctx_params) guard let context else { From c506d7fc46ed2bc80c20455c0f9f1b09c7b47669 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 15:23:51 -0700 Subject: [PATCH 28/48] threadpool: enable --cpu-mask and other threadpool related options only if threadpool is enabled --- common/common.cpp | 57 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 56e86a07ad394..faaea88cd4b3c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1707,39 +1707,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); - options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); - options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); - options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); - options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); - options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", - "ranges of CPUs for affinity. 
Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch <0|1>", - "use strict CPU placement (default: same as --cpu-strict)"}); - options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); - options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft <0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)"}); - options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); - options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M", - "Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + +#ifndef GGML_USE_OPENMP + // these options are available only with the internal threadpool + options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); + options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); + options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); + options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); + + options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); + options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); + options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); + options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); + options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); + + options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); + options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft"}); + options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); + options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); + options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); + + options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); - options.push_back({ "speculative", " --priority-batch-draft N", - "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft <0|1>", - "Use polling to wait for draft model work (default: --poll-draft)"}); + "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); + options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); +#endif // GGML_USE_OPENMP options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); From 8008463aeea5fcea89e8bc39351cb297fc749b28 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 15:36:02 -0700 Subject: [PATCH 29/48] threadpool: replace checks for compute_thread ret code with proper status check --- ggml/src/ggml.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 839aa3f5796fc..f0a70138f4fe7 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19218,13 +19218,9 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { if (state->pending) { state->pending = false; - int64_t ret = (int64_t) ggml_graph_compute_thread(state); - if (ret == GGML_EXIT_ABORTED) - return (thread_ret_t) ret; - - if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) { - fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret); - GGML_ASSERT(false); + ggml_graph_compute_thread(state); + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { + break; } } } From 49ac51f2a314a3e2334f756e4bce84cff07a40bb Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 17:35:34 -0700 Subject: [PATCH 30/48] threadpool: simplify threadpool init logic and fix main thread affinity application Most of the init code is now exactly the same between threadpool and openmp. 
--- ggml/src/ggml.c | 73 +++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f0a70138f4fe7..b8a6ec7385056 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19191,7 +19191,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_threadpool * threadpool = state->threadpool; ggml_thread_apply_thread_priority(threadpool->prio); - if (ggml_thread_cpumask_is_valid(state->cpumask)) { ggml_thread_apply_affinity(state->cpumask); } @@ -19296,51 +19295,35 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( threadpool->ec = GGML_STATUS_SUCCESS; } -#ifndef GGML_USE_OPENMP - ggml_mutex_init(&threadpool->mutex); - ggml_cond_init(&threadpool->cond); -#endif // GGML_USE_OPENMP + // Allocate and init workers state + const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; + struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size); - struct ggml_compute_state * workers = - GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads); + memset(workers, 0, workers_size); + for (int j = 0; j < tpp->n_threads; j++) { + workers[j].threadpool = threadpool; + workers[j].ith = j; + } threadpool->workers = workers; -#ifdef GGML_USE_OPENMP - for (int j = 0; j < tpp->n_threads; j++) { - workers[j] = (struct ggml_compute_state) { - .threadpool = threadpool, - .ith = j - }; - } -#else // Not using OPENMP - int32_t cpumask_iter = 0; +#ifndef GGML_USE_OPENMP + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); - ggml_thread_apply_process_priority(tpp->prio); - ggml_thread_apply_thread_priority(tpp->prio); + // Spin the threads for all workers, and update CPU placements. + // Place the main thread last (towards the higher numbered CPU cores). 
- for (int j = 0; j < tpp->n_threads; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .threadpool = threadpool, - .ith = j, - .last_graph = 0, - .pending = false - }; + int32_t cpumask_iter = 0; + for (int j = 1; j < tpp->n_threads; j++) { ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); - // Spin threads for all secondary workers - if (j > 0) { - int32_t rc = ggml_thread_create( - &workers[j].thrd, - NULL, - ggml_graph_compute_secondary_thread, - &workers[j] - ); - GGML_ASSERT(rc == 0); - } + int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]); + GGML_ASSERT(rc == 0); } + + ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter); #endif // GGML_USE_OPENMP return threadpool; @@ -19391,22 +19374,16 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->n_threads_cur = n_threads; } - struct ggml_compute_state worker = { - .ith = omp_get_thread_num(), - .threadpool = threadpool, - }; - ggml_graph_compute_thread(&worker); + ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); } } else { - struct ggml_compute_state worker = { - .ith = 0, - .threadpool = threadpool, - }; - ggml_graph_compute_thread(&worker); + ggml_graph_compute_thread(&threadpool->workers[0]); } #else - // Update main thread affinity to match the current threadpool - if (!ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + // Update main thread prio and affinity to match the current threadpool + ggml_thread_apply_process_priority(threadpool->prio); + ggml_thread_apply_thread_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } From 204377a0a8ebdaf54aac588e456284f557b5eea3 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 18:03:06 -0700 Subject: [PATCH 31/48] threadpool: update threadpool resume/pause function names --- ggml/src/ggml.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b8a6ec7385056..356a722173916 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18888,13 +18888,13 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { #ifndef GGML_USE_OPENMP // pause/resume must be called under mutex -static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { +static void ggml_pause_threadpool_locked(struct ggml_compute_threadpool * threadpool) { GGML_PRINT_DEBUG("Pausing threadpool\n"); threadpool->pause = true; ggml_cond_broadcast(&threadpool->cond); } -static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { +static void ggml_resume_threadpool_locked(struct ggml_compute_threadpool * threadpool) { GGML_PRINT_DEBUG("Resuming threadpool\n"); threadpool->pause = false; ggml_cond_broadcast(&threadpool->cond); @@ -18905,7 +18905,7 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP ggml_mutex_lock(&threadpool->mutex); if (!threadpool->pause) { - __ggml_pause_threadpool(threadpool); + ggml_pause_threadpool_locked(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -18917,7 +18917,7 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP ggml_mutex_lock(&threadpool->mutex); if (threadpool->pause) { - __ggml_resume_threadpool(threadpool); + 
ggml_resume_threadpool_locked(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -19238,7 +19238,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo if (threadpool->pause) { // resume does cond broadcast - __ggml_resume_threadpool(threadpool); + ggml_resume_threadpool_locked(threadpool); } else { ggml_cond_broadcast(&threadpool->cond); } From 93f170d8688a3244c5f7f8a0a8596745b6543168 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 18:26:37 -0700 Subject: [PATCH 32/48] threadpool: enable openmp by default for now --- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index a5a8c5d8c2887..cc16858849783 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -146,7 +146,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING "ggml: metal minimum macOS version") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -option(GGML_OPENMP "ggml: use OpenMP" OFF) +option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) From a7496bf7e5c276a3d06521fd0951f77cb7af9275 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 24 Aug 2024 19:05:54 -0700 Subject: [PATCH 33/48] threadpool: don't forget to free workers state when omp is enabled --- ggml/src/ggml.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 356a722173916..81c3afa180117 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18877,12 +18877,11 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { UNUSED(rc); } - GGML_ALIGNED_FREE(workers); - ggml_mutex_destroy(&threadpool->mutex); ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP + GGML_ALIGNED_FREE(threadpool->workers); GGML_ALIGNED_FREE(threadpool); } From 8186e9615f07708497d794bb5949f13c3c7ba851 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 26 Aug 2024 12:25:48 -0700 Subject: [PATCH 34/48] threadpool: avoid updating process priority on the platforms that do not require it On Windows we need to change overall process priority class in order to set thread priorities, but on Linux, Mac, etc we do not need to touch the overall process settings. --- ggml/src/ggml.c | 84 ++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 81c3afa180117..0aeab2a9cb974 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18700,9 +18700,16 @@ static bool ggml_thread_apply_affinity(bool * mask) { return m != 0; } -static bool ggml_thread_apply_process_priority(int32_t prio) { +static bool ggml_thread_apply_thread_priority(int32_t prio) { DWORD p = NORMAL_PRIORITY_CLASS; + if (prio == SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + // On Windows we have to update Process Priority Class in order to set Thread priority. 
+ switch (prio) { case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; @@ -18710,11 +18717,10 @@ static bool ggml_thread_apply_process_priority(int32_t prio) { case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; } - return SetPriorityClass(GetCurrentProcess(), p); -} - -static bool ggml_thread_apply_thread_priority(int32_t prio) { - DWORD p = NORMAL_PRIORITY_CLASS; + if (!SetPriorityClass(GetCurrentProcess(), p)) { + fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + return false; + } switch (prio) { case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; @@ -18723,8 +18729,12 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) { case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; } - return SetThreadPriority(GetCurrentThread(), p); + if (!SetThreadPriority(GetCurrentThread(), p)) { + fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + return true; } #elif defined(__APPLE__) @@ -18732,26 +18742,32 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) { #include static bool ggml_thread_apply_affinity(const bool * mask) { + // Not supported on Apple platforms UNUSED(mask); return true; } -static bool ggml_thread_apply_process_priority(int32_t prio) { - int32_t p = 0; - +static bool ggml_thread_apply_thread_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; switch (prio) { - case SCHED_PRIO_NORMAL: p = 0; break; - case SCHED_PRIO_MEDIUM: p = -5; break; - case SCHED_PRIO_HIGH: p = -10; break; - case SCHED_PRIO_REALTIME: p = -20; break; + case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - int32_t r = setpriority(PRIO_PROCESS, 0, p); - return r != -1; -} + if (prio == SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); + return false; + } -static bool ggml_thread_apply_thread_priority(int32_t prio) { - UNUSED(prio); return true; } @@ -18759,7 +18775,7 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) { static bool ggml_thread_apply_affinity(const bool * mask) { cpu_set_t cpuset; - int32_t err; + int err; CPU_ZERO(&cpuset); @@ -18779,17 +18795,16 @@ static bool ggml_thread_apply_affinity(const bool * mask) { err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); #endif if (err != 0) { - //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err)); + fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err); return false; } return true; } -static bool ggml_thread_apply_process_priority(int32_t prio) { +static bool ggml_thread_apply_thread_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; - switch (prio) { case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; @@ -18797,28 +18812,14 @@ static bool 
ggml_thread_apply_process_priority(int32_t prio) { case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - int32_t err = sched_setscheduler(0, policy, &p); - if (err != 0) { - //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err); - return false; - } - - return true; -} - -static bool ggml_thread_apply_thread_priority(int32_t prio) { - struct sched_param p; - int32_t policy = SCHED_OTHER; - switch (prio) { - case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + if (prio == SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; } int32_t err = pthread_setschedparam(pthread_self(), policy, &p); if (err != 0) { - //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err); + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); return false; } @@ -19380,7 +19381,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } #else // Update main thread prio and affinity to match the current threadpool - ggml_thread_apply_process_priority(threadpool->prio); ggml_thread_apply_thread_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); From 658f16c330282b1143181093aeabd9fc0daf01e5 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 26 Aug 2024 13:10:11 -0700 Subject: [PATCH 35/48] threadpool: update calling thread prio and affinity only at start/resume This avoids extra syscalls for each graph_compute() --- ggml/src/ggml.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0aeab2a9cb974..030e26104ebf3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19237,6 +19237,12 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); if (threadpool->pause) { + // Update main thread prio and affinity to match the threadpool settings + ggml_thread_apply_thread_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + // resume does cond broadcast ggml_resume_threadpool_locked(threadpool); } else { @@ -19324,6 +19330,14 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( } ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter); + + if (!threadpool->pause) { + // Update main thread prio and affinity at the start, otherwise we'll do it in resume + ggml_thread_apply_thread_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + } #endif // GGML_USE_OPENMP return threadpool; @@ -19380,12 +19394,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_graph_compute_thread(&threadpool->workers[0]); } #else - // Update main thread prio and affinity to match the current threadpool - ggml_thread_apply_thread_priority(threadpool->prio); - if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { - 
ggml_thread_apply_affinity(threadpool->workers[0].cpumask); - } - // Kick all threads to start the new graph ggml_graph_compute_kickoff(threadpool); From 8d5ab9a58ea3cc9af41b25814256254e01dda218 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 26 Aug 2024 17:07:36 -0700 Subject: [PATCH 36/48] llama-bench: turn threadpool params into vectors, add output headers, etc --- examples/llama-bench/llama-bench.cpp | 133 ++++++++++++++++++--------- 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index aca5f83c30147..9d1ffb9b2ae22 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -225,6 +225,9 @@ struct cmd_params { std::vector type_k; std::vector type_v; std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; std::vector n_gpu_layers; std::vector rpc_servers; std::vector split_mode; @@ -235,8 +238,8 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; ggml_numa_strategy numa; - cpu_params cpuparams; int reps; + int prio; bool verbose; output_formats output_format; output_formats output_format_stderr; @@ -252,6 +255,9 @@ static const cmd_params cmd_params_defaults = { /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {cpu_get_num_math()}, + /* cpu_mask */ {"0x0"}, + /* cpu_strict */ {false}, + /* poll */ {50}, /* n_gpu_layers */ {99}, /* rpc_servers */ {""}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, @@ -262,8 +268,8 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* cpuparams */ {}, /* reps */ 5, + /* prio */ 0, /* verbose */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, @@ -283,6 +289,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); @@ -291,13 +300,10 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -C, --cpu-mask (default: 0x0)\n"); - printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); - printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); - printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll); printf(" -embd, --embeddings <0|1> (default: %s)\n", 
join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); @@ -344,6 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -439,6 +446,33 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); + } else if (arg == "--cpu-strict") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); + } else if (arg == "--poll") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.poll.insert(params.poll.end(), p.begin(), p.end()); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prio = std::stoi(argv[i]); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; @@ -498,32 +532,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } - } else if (arg == "-C" || arg == "--cpu-mask") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string mask = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); - } else if (arg == "--prio") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.cpuparams.priority = std::stoul(argv[i]); - } else if (arg == "--cpu-strict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.cpuparams.strict_cpu = std::stoul(argv[i]); - } else if (arg == "--poll") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.cpuparams.poll = std::stoul(argv[i]); } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; @@ -617,6 +625,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } + if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } + if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } + if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } return params; } @@ -630,6 +641,9 @@ struct cmd_params_instance { ggml_type type_k; ggml_type type_v; int 
n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; int n_gpu_layers; std::string rpc_servers; llama_split_mode split_mode; @@ -699,7 +713,10 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tv : params.type_v) for (const auto & nkvo : params.no_kv_offload) for (const auto & fa : params.flash_attn) - for (const auto & nt : params.n_threads) { + for (const auto & nt : params.n_threads) + for (const auto & cm : params.cpu_mask) + for (const auto & cs : params.cpu_strict) + for (const auto & pl : params.poll) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { continue; @@ -713,6 +730,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -739,6 +759,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -765,6 +788,9 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -801,6 +827,9 @@ struct test { int n_batch; int n_ubatch; int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; bool has_rpc; ggml_type type_k; ggml_type type_v; @@ -827,6 +856,9 @@ struct test { n_batch = inst.n_batch; n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; has_rpc = !inst.rpc_servers.empty(); type_k = inst.type_k; type_v = inst.type_v; @@ -904,13 +936,14 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", - "n_threads", "type_k", "type_v", + "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts" + "avg_ts", "stddev_ts", }; return fields; } @@ -919,7 +952,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || + field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || @@ -928,6 +961,7 @@ struct test { } if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } @@ -960,7 +994,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), + 
ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), @@ -1099,7 +1134,7 @@ struct markdown_printer : public printer { return -30; } if (field == "t/s") { - return 16; + return 20; } if (field == "size" || field == "params") { return 10; @@ -1181,6 +1216,15 @@ struct markdown_printer : public printer { if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } + if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) { + fields.emplace_back("cpu_mask"); + } + if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) { + fields.emplace_back("cpu_strict"); + } + if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) { + fields.emplace_back("poll"); + } if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.emplace_back("n_batch"); } @@ -1434,8 +1478,6 @@ int main(int argc, char ** argv) { llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - postprocess_cpu_params(params.cpuparams); - for (const auto & inst : params_instances) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1463,12 +1505,13 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); - tpp.strict_cpu = params.cpuparams.strict_cpu; - tpp.prio = params.cpuparams.priority; - tpp.poll = params.cpuparams.poll; - if (params.cpuparams.mask_valid) { - std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { + LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); + exit(1); } + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { From 3bcc4dee9ad97adc95feba5e91b1f9d11f77a7ce Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 10:59:07 -0700 Subject: [PATCH 37/48] llama-bench: add support for cool off between tests --delay This helps for long running tests on platforms that are thermally limited (phones, laptops, etc). --delay (disabled by default) introduces the sleep for N seconds before starting each test. 
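For illustration, the mechanism is just a sleep at the top of the per-test loop. A minimal, self-contained C++ sketch of the idea follows; the three-test loop and the hard-coded delay are placeholders, not llama-bench code.

```cpp
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
    const int delay = 2; // seconds; 0 disables the cool-off, matching the default

    for (int i = 0; i < 3; i++) {
        if (delay) {
            // let a thermally limited device recover before the next test
            std::this_thread::sleep_for(std::chrono::seconds(delay));
        }
        std::printf("running test %d\n", i);
    }
    return 0;
}
```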
--- examples/llama-bench/llama-bench.cpp | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 9d1ffb9b2ae22..a0cbc2ae9a961 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -16,6 +16,7 @@ #include #include #include +#include <thread> #include "ggml.h" #include "llama.h" @@ -240,6 +241,7 @@ struct cmd_params { ggml_numa_strategy numa; int reps; int prio; + int delay; bool verbose; output_formats output_format; output_formats output_format_stderr; @@ -270,6 +272,7 @@ static const cmd_params cmd_params_defaults = { /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ 0, + /* delay */ 0, /* verbose */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, @@ -304,6 +307,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); + printf(" --delay <0...N> (default: %d)\n", cmd_params_defaults.delay); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); @@ -351,6 +355,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.reps = cmd_params_defaults.reps; params.numa = cmd_params_defaults.numa; params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -467,12 +472,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.poll.insert(params.poll.end(), p.begin(), p.end()); - } else if (arg == "--prio") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.prio = std::stoi(argv[i]); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; @@ -581,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.reps = std::stoi(argv[i]); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prio = std::stoi(argv[i]); + } else if (arg == "--delay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.delay = std::stoi(argv[i]); } else if (arg == "-o" || arg == "--output") { if (++i >= argc) { invalid_param = true; @@ -1504,6 +1515,11 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); + // cool off before the test + if (params.delay) { + std::this_thread::sleep_for(std::chrono::seconds(params.delay)); + } + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); From 5d4c0a132707797c809e4d750f7fa6c5c833d5c0 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 16:31:34 -0700 Subject: [PATCH 38/48] threadpool: move process priority setting into the apps (bench and cli) This avoids changing the overall process priority on Windows for the apps that use ggml/llama.cpp directly.
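The resulting split: the app sets the process-wide priority once at startup (priority class on Windows, nice value via setpriority on POSIX), while ggml only touches per-thread priorities. Below is a POSIX-only, self-contained sketch of the app-side half; app_set_process_priority is a hypothetical name, and the real helper (including the Windows SetPriorityClass branch) is in the common.cpp hunk that follows. Note that setpriority() returns 0 on success and -1 on error.

```cpp
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <sys/resource.h> // setpriority, PRIO_PROCESS

// Illustrative app-side helper: adjust the nice value of the whole process
// once at startup; per-thread scheduling stays inside the threadpool code.
static bool app_set_process_priority(int nice_value) {
    if (setpriority(PRIO_PROCESS, 0, nice_value) != 0) {
        std::fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n",
                     nice_value, std::strerror(errno), errno);
        return false;
    }
    return true;
}

int main() {
    // negative nice values typically require elevated privileges;
    // the warning path above handles the failure gracefully
    app_set_process_priority(-5);
    std::printf("process priority updated\n");
    return 0;
}
```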
--- common/common.cpp | 59 ++++++++++++++++++++-- common/common.h | 3 +- examples/llama-bench/llama-bench.cpp | 14 +++--- examples/main/main.cpp | 2 + ggml/include/ggml.h | 20 +++++--- ggml/src/ggml.c | 74 ++++++++++------------------ 6 files changed, 108 insertions(+), 64 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index faaea88cd4b3c..9191ade718c1f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -251,6 +251,57 @@ int32_t cpu_get_num_math() { return cpu_get_num_physical_cores(); } +// Helper for setting process priority + +#if defined(_WIN32) + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + DWORD p = NORMAL_PRIORITY_CLASS; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + if (!SetPriorityClass(GetCurrentProcess(), p)) { + fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + + return true; +} + +#else // MacOS and POSIX +#include <sys/types.h> +#include <sys/resource.h> + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + int32_t p = 0; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = 0; break; + case GGML_SCHED_PRIO_MEDIUM: p = -5; break; + case GGML_SCHED_PRIO_HIGH: p = -10; break; + case GGML_SCHED_PRIO_REALTIME: p = -20; break; + } + + if (setpriority(PRIO_PROCESS, 0, p) != 0) { + fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + return false; + } + return true; +} + +#endif + // // CLI argument parsing // @@ -508,7 +559,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio") { CHECK_ARG - params.cpuparams.priority = std::stoul(argv[i]); + params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict") { @@ -545,7 +596,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-batch") { CHECK_ARG - params.cpuparams_batch.priority = std::stoul(argv[i]); + params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-batch") { @@ -581,7 +632,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-draft") { CHECK_ARG - params.draft_cpuparams.priority = std::stoul(argv[i]); + params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-draft") { @@ -610,7 +661,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-batch-draft") { CHECK_ARG - params.draft_cpuparams_batch.priority = std::stoul(argv[i]); + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-batch-draft") { diff --git a/common/common.h b/common/common.h index a665716bee133..cb5e7f6df10c5 100644 --- a/common/common.h +++ b/common/common.h @@ -71,7 +71,7 @@ struct cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
bool mask_valid = false; // Default: any CPU - int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; @@ -290,6 +290,7 @@ std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool set_process_priority(enum ggml_sched_priority prio); // // String utils diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a0cbc2ae9a961..b201013534d2f 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -240,7 +240,7 @@ struct cmd_params { std::vector embeddings; ggml_numa_strategy numa; int reps; - int prio; + ggml_sched_priority prio; int delay; bool verbose; output_formats output_format; @@ -271,7 +271,7 @@ static const cmd_params cmd_params_defaults = { /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, - /* prio */ 0, + /* prio */ GGML_SCHED_PRIO_NORMAL, /* delay */ 0, /* verbose */ false, /* output_format */ MARKDOWN, @@ -585,7 +585,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.prio = std::stoi(argv[i]); + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); } else if (arg == "--delay") { if (++i >= argc) { invalid_param = true; @@ -1470,6 +1470,8 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + set_process_priority(params.prio); + // initialize printer std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); @@ -1525,9 +1527,9 @@ int main(int argc, char ** argv) { LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); exit(1); } - tpp.strict_cpu = t.cpu_strict; - tpp.poll = t.poll; - tpp.prio = params.prio; + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a64c1bc258f59..bdaf0dbb6f7e1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -230,6 +230,8 @@ int main(int argc, char ** argv) { struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); + set_process_priority(params.cpuparams.priority); + struct ggml_compute_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_create_threadpool(&tpp_batch); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1df73d3283bec..1ced22eec79d9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -626,15 +626,23 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + // Scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + // Threadpool params // Use ggml_threadpool_params_default() or 
ggml_threadpool_params_init() to populate the defaults struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - int32_t prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state }; struct ggml_compute_threadpool; // forward declaration, see ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 030e26104ebf3..dd08b77f8457e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18655,18 +18655,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { static thread_ret_t ggml_graph_compute_secondary_thread(void* data); -enum { - SCHED_PRIO_NORMAL, - SCHED_PRIO_MEDIUM, - SCHED_PRIO_HIGH, - SCHED_PRIO_REALTIME -}; - #if defined(_WIN32) #include "windows.h" // TODO: support > 64 CPUs -static bool ggml_thread_apply_affinity(bool * mask) { +bool ggml_thread_apply_affinity(bool * mask) { HANDLE h = GetCurrentThread(); uint64_t bitmask = 0ULL; @@ -18700,33 +18693,20 @@ static bool ggml_thread_apply_affinity(bool * mask) { return m != 0; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { - DWORD p = NORMAL_PRIORITY_CLASS; - - if (prio == SCHED_PRIO_NORMAL) { - // Keep inherited policy/priority - return true; - } - - // On Windows we have to update Process Priority Class in order to set Thread priority. - +static bool ggml_thread_apply_priority(int32_t prio) { + // Note that on Windows the Process Priority Class must be updated in order to set Thread priority. + // This is up to the applications. 
+ DWORD p = THREAD_PRIORITY_NORMAL; switch (prio) { - case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; - case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; - case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; - case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; - } - - if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); - return false; + case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; } - switch (prio) { - case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; - case SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; - case SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; - case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; } if (!SetThreadPriority(GetCurrentThread(), p)) { @@ -18747,17 +18727,17 @@ static bool ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { +static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { - case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - if (prio == SCHED_PRIO_NORMAL) { + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; } @@ -18802,17 +18782,17 @@ static bool ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { +static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { - case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - if (prio == SCHED_PRIO_NORMAL) { + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; } @@ -19190,7 +19170,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; - 
ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(state->cpumask)) { ggml_thread_apply_affinity(state->cpumask); } @@ -19238,7 +19218,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo if (threadpool->pause) { // Update main thread prio and affinity to match the threadpool settings - ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } @@ -19333,7 +19313,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( if (!threadpool->pause) { // Update main thread prio and affinity at the start, otherwise we'll do it in resume - ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } From e3c2202049b771e60732060536dd753524c59e49 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 13:19:45 -0700 Subject: [PATCH 39/48] threadpool: move all pause/resume logic into ggml --- examples/llama-bench/llama-bench.cpp | 2 +- examples/main/main.cpp | 9 +--- ggml/src/ggml-backend.c | 5 ++ ggml/src/ggml.c | 3 -- include/llama.h | 11 +--- src/llama.cpp | 77 ++++------------------------ 6 files changed, 19 insertions(+), 88 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index b201013534d2f..b6b1efe0295a7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1537,7 +1537,7 @@ int main(int argc, char ** argv) { exit(1); } - llama_attach_threadpool(ctx, threadpool); + llama_attach_threadpool(ctx, threadpool, NULL); // warmup run if (t.n_prompt > 0) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bdaf0dbb6f7e1..0ccd0558fc94a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -240,11 +240,6 @@ int main(int argc, char ** argv) { exit(1); } - llama_attach_batch_threadpool(ctx, threadpool_batch); - if (ctx_guidance) { - llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); - } - // Start the non-batch threadpool in the paused state tpp.paused = true; } @@ -255,9 +250,9 @@ int main(int argc, char ** argv) { exit(1); } - llama_attach_threadpool(ctx, threadpool); + llama_attach_threadpool(ctx, threadpool, threadpool_batch); if (ctx_guidance) { - llama_attach_threadpool(ctx_guidance, threadpool); + llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch); } const int n_ctx_train = llama_n_ctx_train(model); diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 826b99ac01ace..03e41a09cb7c9 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -910,6 +910,11 @@ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_th GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + + if (ctx->threadpool && ctx->threadpool != threadpool) { + // already had a different threadpool, pause/suspend it before switching + ggml_pause_threadpool(ctx->threadpool); + } ctx->threadpool = threadpool; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index dd08b77f8457e..f05f89a275d4e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19198,9 +19198,6 @@ 
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { state->pending = false; ggml_graph_compute_thread(state); - if (state->threadpool->ec != GGML_STATUS_SUCCESS) { - break; - } } } diff --git a/include/llama.h b/include/llama.h index 7b103261d3982..c03c4929b4e30 100644 --- a/include/llama.h +++ b/include/llama.h @@ -431,16 +431,9 @@ extern "C" { // Optional: an auto threadpool gets created in ggml if not passed explicitly LLAMA_API void llama_attach_threadpool( struct llama_context * ctx, - ggml_compute_threadpool_t threadpool); - LLAMA_API void llama_attach_batch_threadpool( - struct llama_context * ctx, - ggml_compute_threadpool_t threadpool); + ggml_compute_threadpool_t threadpool, + ggml_compute_threadpool_t threadpool_batch); LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); - LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); - LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); - - // Pauses all attached threadpools - LLAMA_API void llama_pause_threadpools(struct llama_context * ctx); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 916d0f8c13484..57e765ce06355 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15523,39 +15523,6 @@ static void llama_graph_compute( // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } -// Optionally swaps the batch and single-tok threadpools. -// Returns the number of threads, and if a valid threadpool exists, returns it too. -static std::pair llama_swap_threadpools( - llama_context & lctx, - int32_t n_tokens) { - - const auto & cparams = lctx.cparams; - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - - ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool - - // A batch threadpool without a non-batch threadpool isn't supported. - GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool); - - if (lctx.threadpool_batch && lctx.threadpool) { - // Switch between the 2 threadpools as needed - if (n_tokens > 1) { - ggml_pause_threadpool(lctx.threadpool); - threadpool = lctx.threadpool_batch; - n_threads = cparams.n_threads_batch; - } else { - ggml_pause_threadpool(lctx.threadpool_batch); - threadpool = lctx.threadpool; - n_threads = cparams.n_threads; - } - } else if (lctx.threadpool) { - threadpool = lctx.threadpool; - n_threads = cparams.n_threads; - } - return std::make_pair(n_threads, threadpool); -} - - // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -15662,11 +15629,8 @@ static int llama_decode_internal( lctx.n_outputs = n_outputs_new; } - std::pair threads = - llama_swap_threadpools(lctx, n_tokens); - - int n_threads = threads.first; - ggml_compute_threadpool_t threadpool = threads.second; + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; GGML_ASSERT(n_threads > 0); @@ -15906,11 +15870,9 @@ static int llama_encode_internal( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - std::pair threads = - llama_swap_threadpools(lctx, n_tokens); + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + ggml_compute_threadpool_t threadpool = n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch; - int n_threads = threads.first; - ggml_compute_threadpool_t threadpool = threads.second; GGML_ASSERT(n_threads > 0); ggml_backend_sched_reset(lctx.sched); @@ -17500,36 +17462,15 @@ void llama_numa_init(enum ggml_numa_strategy numa) { void llama_attach_threadpool( struct llama_context * ctx, - ggml_compute_threadpool_t threadpool) { - ctx->threadpool = threadpool; -} - -void llama_attach_batch_threadpool( - struct llama_context * ctx, + ggml_compute_threadpool_t threadpool, ggml_compute_threadpool_t threadpool_batch) { - ctx->threadpool_batch = threadpool_batch; + ctx->threadpool = threadpool; + ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; } void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; -} - -void llama_detach_batch_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; -} - -void llama_detach_threadpools(struct llama_context * ctx) { - llama_detach_threadpool(ctx); - llama_detach_batch_threadpool(ctx); -} - -void llama_pause_threadpools(struct llama_context * ctx) { - if (ctx->threadpool) { - ggml_pause_threadpool(ctx->threadpool); - } - if (ctx->threadpool_batch) { - ggml_pause_threadpool(ctx->threadpool_batch); - } + ctx->threadpool = nullptr; + ctx->threadpool_batch = nullptr; } void llama_backend_free(void) { From c6328bc0adc8ee412e51c288457c69aea8a6eca3 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 18:55:59 -0700 Subject: [PATCH 40/48] threadpool: further api cleanup and prep for future refactoring All threadpool-related functions and structs use the ggml_threadpool prefix. --- examples/llama-bench/llama-bench.cpp | 4 +-- examples/main/main.cpp | 10 +++--- ggml/include/ggml-backend.h | 2 +- ggml/include/ggml.h | 18 +++++----- ggml/src/ggml-backend.c | 6 ++-- ggml/src/ggml.c | 54 ++++++++++++++-------------- include/llama.h | 4 +-- src/llama.cpp | 14 ++++---- 8 files changed, 56 insertions(+), 56 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index b6b1efe0295a7..ce461333cf473 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + struct ggml_threadpool* threadpool = ggml_threadpool_create(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_release_threadpool(threadpool); + ggml_threadpool_release(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0ccd0558fc94a..4d8b028010060 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -232,9 +232,9 @@ int main(int argc, char ** argv) { set_process_priority(params.cpuparams.priority); - struct ggml_compute_threadpool * threadpool_batch = NULL; + struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_create_threadpool(&tpp_batch); + threadpool_batch = ggml_threadpool_create(&tpp_batch); if (!threadpool_batch) { LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); exit(1); @@ -244,7 +244,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_compute_threadpool * threadpool = 
ggml_create_threadpool(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_create(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1023,8 +1023,8 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); - ggml_release_threadpool(threadpool); - ggml_release_threadpool(threadpool_batch); + ggml_threadpool_release(threadpool); + ggml_threadpool_release(threadpool_batch); #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index c59f9f54a44b9..e4612108122d0 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -102,7 +102,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); - GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1ced22eec79d9..8b10e025aa121 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -645,9 +645,9 @@ extern "C" { bool paused; // start in paused state }; - struct ggml_compute_threadpool; // forward declaration, see ggml.c + struct ggml_threadpool; // forward declaration, see ggml.c - typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; + typedef struct ggml_threadpool * ggml_threadpool_t; // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 @@ -656,7 +656,7 @@ extern "C" { uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; - struct ggml_compute_threadpool * threadpool; + struct ggml_threadpool * threadpool; // abort ggml_graph_compute when true ggml_abort_callback abort_callback; @@ -2039,18 +2039,18 @@ extern "C" { GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); GGML_API void ggml_threadpool_params_init(struct ggml_threadpool_params *p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); - GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); - GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); - GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool); - GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API struct ggml_threadpool* ggml_threadpool_create (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_release (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 
0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan( const struct ggml_cgraph * cgraph, int n_threads, /* = GGML_DEFAULT_N_THREADS */ - struct ggml_compute_threadpool * threadpool /* = NULL */ ); + struct ggml_threadpool * threadpool /* = NULL */ ); GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 03e41a09cb7c9..99ec15a0f4af6 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -723,7 +723,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { struct ggml_backend_cpu_context { int n_threads; - ggml_compute_threadpool_t threadpool; + ggml_threadpool_t threadpool; void * work_data; size_t work_size; @@ -906,14 +906,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; if (ctx->threadpool && ctx->threadpool != threadpool) { // already had a different threadpool, pause/suspend it before switching - ggml_pause_threadpool(ctx->threadpool); + ggml_threadpool_pause(ctx->threadpool); } ctx->threadpool = threadpool; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f05f89a275d4e..c8f6152e55c5e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1955,7 +1955,7 @@ typedef pthread_mutex_t ggml_mutex_t; #endif // Threadpool def -struct ggml_compute_threadpool { +struct ggml_threadpool { ggml_mutex_t mutex; // mutex for cond.var ggml_cond_t cond; // cond.var for waiting for new work @@ -1990,7 +1990,7 @@ struct ggml_compute_state { int last_graph; bool pending; #endif - struct ggml_compute_threadpool * threadpool; + struct ggml_threadpool * threadpool; int ith; }; @@ -2002,7 +2002,7 @@ struct ggml_compute_params { size_t wsize; void * wdata; - struct ggml_compute_threadpool * threadpool; + struct ggml_threadpool * threadpool; }; // @@ -3110,7 +3110,7 @@ inline static void ggml_critical_section_start(void) { } #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { +static void ggml_barrier(struct ggml_threadpool * threadpool) { if (threadpool->n_threads_cur == 1) { return; } @@ -3118,7 +3118,7 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { #pragma omp barrier } #else -static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { +static void ggml_barrier(struct ggml_threadpool * threadpool) { if (threadpool->n_threads_cur == 1) { return; } @@ -18837,7 +18837,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask } } -void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { +void ggml_threadpool_release(struct ggml_threadpool* threadpool) { if (!threadpool) return; #ifndef GGML_USE_OPENMP @@ -18868,24 +18868,24 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { #ifndef GGML_USE_OPENMP // pause/resume must be called under mutex -static void ggml_pause_threadpool_locked(struct ggml_compute_threadpool * threadpool) { +static void ggml_threadpool_pause_locked(struct 
ggml_threadpool * threadpool) { GGML_PRINT_DEBUG("Pausing threadpool\n"); threadpool->pause = true; ggml_cond_broadcast(&threadpool->cond); } -static void ggml_resume_threadpool_locked(struct ggml_compute_threadpool * threadpool) { +static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) { GGML_PRINT_DEBUG("Resuming threadpool\n"); threadpool->pause = false; ggml_cond_broadcast(&threadpool->cond); } #endif -void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { +void ggml_threadpool_pause(struct ggml_threadpool * threadpool) { #ifndef GGML_USE_OPENMP ggml_mutex_lock(&threadpool->mutex); if (!threadpool->pause) { - ggml_pause_threadpool_locked(threadpool); + ggml_threadpool_pause_locked(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -18893,11 +18893,11 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #endif } -void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { +void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { #ifndef GGML_USE_OPENMP ggml_mutex_lock(&threadpool->mutex); if (threadpool->pause) { - ggml_resume_threadpool_locked(threadpool); + ggml_threadpool_resume_locked(threadpool); } ggml_mutex_unlock(&threadpool->mutex); #else @@ -18908,7 +18908,7 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { struct ggml_cplan ggml_graph_plan( const struct ggml_cgraph * cgraph, int n_threads, - struct ggml_compute_threadpool * threadpool) { + struct ggml_threadpool * threadpool) { if (threadpool == NULL) { GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); @@ -19119,7 +19119,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifndef GGML_USE_OPENMP static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { - struct ggml_compute_threadpool * threadpool = state->threadpool; + struct ggml_threadpool * threadpool = state->threadpool; if (state->pending || threadpool->stop || threadpool->pause) { return true; } @@ -19134,7 +19134,7 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { } static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { - struct ggml_compute_threadpool * threadpool = state->threadpool; + struct ggml_threadpool * threadpool = state->threadpool; // This seems to make 0 ... 100 a decent range for polling level across modern processors. // Perhaps, we can adjust it dynamically based on load and things. 
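The polling comment above is the heart of the hybrid scheme: workers spin briefly looking for new work before blocking on the condition variable, which is what the surrounding poll/check functions arrange. The standalone sketch below models only the idea and is not code from the patch; the atomic flag, the poll_level parameter, and the 4096 scale factor are all illustrative.

    #include <atomic>

    static std::atomic<bool> new_work{false};

    // Spin for a budget proportional to the poll level (0 disables polling),
    // returning true if work arrived while spinning. On false, the caller is
    // expected to block on a condition variable instead of burning CPU.
    static bool poll_for_work(int poll_level) {
        const long budget = static_cast<long>(poll_level) * 4096;
        for (long i = 0; i < budget; i++) {
            if (new_work.load(std::memory_order_relaxed)) {
                return true;
            }
        }
        return false;
    }
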
@@ -19149,7 +19149,7 @@ static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * } static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { - struct ggml_compute_threadpool * threadpool = state->threadpool; + struct ggml_threadpool * threadpool = state->threadpool; if (ggml_graph_compute_poll_for_work(state)) { return state->pending; @@ -19168,7 +19168,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_compute_threadpool * threadpool = state->threadpool; + struct ggml_threadpool * threadpool = state->threadpool; ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(state->cpumask)) { @@ -19205,7 +19205,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { } // Start processing new graph -static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpool) +static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool) { // always take the mutex here because the worker threads are doing hybrid poll/wait @@ -19221,7 +19221,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo } // resume does cond broadcast - ggml_resume_threadpool_locked(threadpool); + ggml_threadpool_resume_locked(threadpool); } else { ggml_cond_broadcast(&threadpool->cond); } @@ -19254,13 +19254,13 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } -static struct ggml_compute_threadpool * ggml_create_threadpool_impl( +static struct ggml_threadpool * ggml_threadpool_create_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { - struct ggml_compute_threadpool * threadpool = - GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool)); + struct ggml_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; threadpool->cplan = cplan; @@ -19320,8 +19320,8 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( return threadpool; } -struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { - return ggml_create_threadpool_impl(tpp, NULL, NULL); +struct ggml_threadpool * ggml_threadpool_create(struct ggml_threadpool_params * tpp) { + return ggml_threadpool_create_impl(tpp, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19330,7 +19330,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); int n_threads = cplan->n_threads; - struct ggml_compute_threadpool * threadpool = cplan->threadpool; + struct ggml_threadpool * threadpool = cplan->threadpool; bool disposable_threadpool = false; @@ -19339,7 +19339,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl disposable_threadpool = true; struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); - threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan); + threadpool = ggml_threadpool_create_impl(&ttp, cgraph, cplan); } else { // Reset some of the parameters that need resetting // No worker threads should be accessing the parameters below at this stage @@ -19384,7 +19384,7 @@ 
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl enum ggml_status ret = threadpool->ec; if (disposable_threadpool) { - ggml_release_threadpool(threadpool); + ggml_threadpool_release(threadpool); } return ret; diff --git a/include/llama.h b/include/llama.h index c03c4929b4e30..b969689be2aa7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -431,8 +431,8 @@ extern "C" { // Optional: an auto threadpool gets created in ggml if not passed explicitly LLAMA_API void llama_attach_threadpool( struct llama_context * ctx, - ggml_compute_threadpool_t threadpool, - ggml_compute_threadpool_t threadpool_batch); + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch); LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); // Call once at the end of the program - currently only used for MPI diff --git a/src/llama.cpp b/src/llama.cpp index 57e765ce06355..9bf6e22afef76 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3091,8 +3091,8 @@ struct llama_context { #endif ggml_backend_t backend_cpu = nullptr; - ggml_compute_threadpool_t threadpool = nullptr; - ggml_compute_threadpool_t threadpool_batch = nullptr; + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; bool has_evaluated_once = false; @@ -15500,7 +15500,7 @@ static void llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, int n_threads, - ggml_compute_threadpool * threadpool) { + ggml_threadpool * threadpool) { #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -15630,7 +15630,7 @@ static int llama_decode_internal( } int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; + ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; GGML_ASSERT(n_threads > 0); @@ -15871,7 +15871,7 @@ static int llama_encode_internal( lctx.n_outputs = n_tokens; int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; + ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; GGML_ASSERT(n_threads > 0); @@ -17462,8 +17462,8 @@ void llama_numa_init(enum ggml_numa_strategy numa) { void llama_attach_threadpool( struct llama_context * ctx, - ggml_compute_threadpool_t threadpool, - ggml_compute_threadpool_t threadpool_batch) { + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { ctx->threadpool = threadpool; ctx->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } From bead7d47fbe90bd333e5d911beae55b25aba3ec8 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 22:33:03 -0700 Subject: [PATCH 41/48] threadpool: minor indent fixes --- ggml/src/ggml-backend.c | 8 ++++---- src/llama.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 99ec15a0f4af6..04c7feb8999bc 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -722,11 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; - ggml_threadpool_t threadpool; + int n_threads; + ggml_threadpool_t threadpool; - void * work_data; - size_t work_size; + void * work_data; + size_t work_size; ggml_abort_callback abort_callback; void * abort_callback_data; diff --git a/src/llama.cpp b/src/llama.cpp index 9bf6e22afef76..fe1942c5d6b26 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15497,9 +15497,9 @@ static void llama_output_reorder(struct llama_context * ctx) { } static void llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads, + llama_context & lctx, + ggml_cgraph * gf, + int n_threads, ggml_threadpool * threadpool) { #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { From 8e8f8ce42d2790a8039d7652546abdd5ea1bf50c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 22:36:01 -0700 Subject: [PATCH 42/48] threadpool: improve setpriority error message --- common/common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9191ade718c1f..9fa18472512ab 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -285,7 +285,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { return true; } - int32_t p = 0; + int p = 0; switch (prio) { case GGML_SCHED_PRIO_NORMAL: p = 0; break; case GGML_SCHED_PRIO_MEDIUM: p = -5; break; @@ -294,7 +294,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { if (!setpriority(PRIO_PROCESS, 0, p)) { - fprintf(stderr, "warn: failed to set process priority class %d : %s (%d)\n", prio, strerror(errno), errno); + fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); return false; } return true; From c6c27b140aeea1ca3fa46e2aca85120c7f768d6b Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 20:54:42 -0700 Subject: [PATCH 43/48] Update examples/llama-bench/llama-bench.cpp Co-authored-by: slaren --- examples/llama-bench/llama-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index ce461333cf473..ff092cd590df4 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -307,7 +307,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); - printf(" --delay <0...N> (default: %d)\n", cmd_params_defaults.delay); + printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n",
cmd_params_defaults.verbose ? "1" : "0"); From b97bd67e2bd4269a0498ca99675e8eeaa76f8388 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 21:04:02 -0700 Subject: [PATCH 44/48] threadpool: fix indent in set_threadpool call --- ggml/src/ggml-backend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 04c7feb8999bc..5b877db3566e7 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -912,8 +912,8 @@ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; if (ctx->threadpool && ctx->threadpool != threadpool) { - // already had a different threadpool, pause/suspend it before switching - ggml_threadpool_pause(ctx->threadpool); + // already had a different threadpool, pause/suspend it before switching + ggml_threadpool_pause(ctx->threadpool); } ctx->threadpool = threadpool; } From cae35b9fb990b325598ca8b2e946c3857897dc72 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 21:17:11 -0700 Subject: [PATCH 45/48] use int32_t for n_thread type in public llama.cpp API --- include/llama.h | 6 +++--- src/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llama.h b/include/llama.h index b969689be2aa7..3c7f89fe0568a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -304,8 +304,8 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) - int n_threads; // number of threads to use for generation - int n_threads_batch; // number of threads to use for batch processing + int32_t n_threads; // number of threads to use for generation + int32_t n_threads_batch; // number of threads to use for batch processing enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id @@ -844,7 +844,7 @@ extern "C" { // Set the number of threads used for decoding // n_threads is the number of threads used for generation (single token) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) - LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch); + LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch); // Get the number of threads used for generation of a single token. 
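A short usage sketch for the widened thread-count setter follows; it assumes ctx is a valid llama_context created elsewhere, and the counts are arbitrary example values. Separate generation and batch counts exist so that latency-bound single-token decoding and throughput-bound prompt processing can be tuned independently.

    #include "llama.h"
    #include <cstdio>

    void configure_threads(llama_context * ctx) {
        // fewer threads for single-token generation, more for batch processing
        llama_set_n_threads(ctx, /*n_threads=*/4, /*n_threads_batch=*/8);
        std::printf("gen: %d, batch: %d\n",
                    llama_n_threads(ctx), llama_n_threads_batch(ctx));
    }
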
LLAMA_API int llama_n_threads(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index fe1942c5d6b26..fb5b76ebffd96 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19389,7 +19389,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } -void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch) { +void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { ctx->cparams.n_threads = n_threads; ctx->cparams.n_threads_batch = n_threads_batch; } From c49d6340718be5a2b9d397bd3db21bba98d2c176 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 21:34:16 -0700 Subject: [PATCH 46/48] threadpool: use _new and _free instead of _create and _release --- examples/llama-bench/llama-bench.cpp | 4 ++-- examples/main/main.cpp | 8 ++++---- ggml/include/ggml.h | 6 +++--- ggml/src/ggml.c | 12 ++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index ff092cd590df4..8edadef909f42 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool* threadpool = ggml_threadpool_create(&tpp); + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_release(threadpool); + ggml_threadpool_free(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4d8b028010060..2c05afb048c7b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -234,7 +234,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_create(&tpp_batch); + threadpool_batch = ggml_threadpool_new(&tpp_batch); if (!threadpool_batch) { LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); exit(1); @@ -244,7 +244,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_create(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1023,8 +1023,8 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); - ggml_threadpool_release(threadpool); - ggml_threadpool_release(threadpool_batch); + ggml_threadpool_free(threadpool); + ggml_threadpool_free(threadpool_batch); #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8b10e025aa121..94afc4d9a9313 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2037,10 +2037,10 @@ extern "C" { GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init(struct ggml_threadpool_params *p, int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, 
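Before the rename lands in the header hunk below, it is worth sketching the lifecycle these declarations describe. This is an illustrative standalone fragment, not code from the series; the parameter values are arbitrary, and per the GGML_USE_OPENMP guards shown earlier, pause/resume only apply to the non-OpenMP scheduler.

    #include "ggml.h"

    void pool_lifecycle(void) {
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
        tpp.paused = true; // create the pool with its workers parked

        struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
        if (!tp) {
            return; // creation failed
        }

        ggml_threadpool_resume(tp); // wake the workers
        // ... submit graphs via a cplan whose .threadpool points at tp ...
        ggml_threadpool_pause(tp);  // park them again between bursts

        ggml_threadpool_free(tp);   // also safe to call on NULL
    }
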
const struct ggml_threadpool_params *p1); - GGML_API struct ggml_threadpool* ggml_threadpool_create (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_release (struct ggml_threadpool * threadpool); + GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c8f6152e55c5e..45dca68d49535 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18837,7 +18837,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask } } -void ggml_threadpool_release(struct ggml_threadpool* threadpool) { +void ggml_threadpool_free(struct ggml_threadpool* threadpool) { if (!threadpool) return; #ifndef GGML_USE_OPENMP @@ -19254,7 +19254,7 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } -static struct ggml_threadpool * ggml_threadpool_create_impl( +static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19320,8 +19320,8 @@ static struct ggml_threadpool * ggml_threadpool_create_impl( return threadpool; } -struct ggml_threadpool * ggml_threadpool_create(struct ggml_threadpool_params * tpp) { - return ggml_threadpool_create_impl(tpp, NULL, NULL); +struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { + return ggml_threadpool_new_impl(tpp, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19339,7 +19339,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl disposable_threadpool = true; struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); - threadpool = ggml_threadpool_create_impl(&ttp, cgraph, cplan); + threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); } else { // Reset some of the parameters that need resetting // No worker threads should be accessing the parameters below at this stage @@ -19384,7 +19384,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl enum ggml_status ret = threadpool->ec; if (disposable_threadpool) { - ggml_threadpool_release(threadpool); + ggml_threadpool_free(threadpool); } return ret; From 3b5f7c2a9b13a40233b79753de60000ae1c95fa3 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 21:56:53 -0700 Subject: [PATCH 47/48] fix two more public APIs to use int32_t for n_threads --- include/llama.h | 4 ++-- src/llama.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llama.h b/include/llama.h index 3c7f89fe0568a..c3bda9e02bb21 100644 --- a/include/llama.h +++ b/include/llama.h @@ -847,10 +847,10 @@ extern "C" { LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch); // Get the number of threads used for generation of a single token. - LLAMA_API int llama_n_threads(struct llama_context * ctx); + LLAMA_API int32_t llama_n_threads(struct llama_context * ctx); // Get the number of threads used for prompt and batch processing (multiple token). 
- LLAMA_API int llama_n_threads_batch(struct llama_context * ctx); + LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx); // Set whether the model is in embeddings mode or not // If true, embeddings will be returned but logits will not diff --git a/src/llama.cpp b/src/llama.cpp index fb5b76ebffd96..2274296b45406 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19394,11 +19394,11 @@ void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t ctx->cparams.n_threads_batch = n_threads_batch; } -int llama_n_threads(struct llama_context * ctx) { +int32_t llama_n_threads(struct llama_context * ctx) { return ctx->cparams.n_threads; } -int llama_n_threads_batch(struct llama_context * ctx) { +int32_t llama_n_threads_batch(struct llama_context * ctx) { return ctx->cparams.n_threads_batch; } From 52aa67723ad0cd8860872405396e1383aa1dda4e Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 28 Aug 2024 22:24:30 -0700 Subject: [PATCH 48/48] build: set _GNU_SOURCE for Android --- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ff84b9bb5f0f2..ec7d308253b59 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1247,7 +1247,7 @@ endif() # Data types, macros and functions related to controlling CPU affinity and # some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux") +if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") add_compile_definitions(_GNU_SOURCE) endif() diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 45dca68d49535..fb14471f55518 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -144,10 +144,6 @@ static int sched_yield (void) { } #else -#ifndef __USE_GNU -#define __USE_GNU -#endif - #include <pthread.h> #include <stdatomic.h> #include <sched.h>
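With the series applied, the public surface fits together as follows. This closing sketch shows the intended end-to-end flow only; model loading, decoding, and error handling are elided, ctx is assumed to be a valid llama_context, and the thread counts are example values.

    #include "ggml.h"
    #include "llama.h"

    void attach_pools(llama_context * ctx) {
        // separate configurations for generation and batch processing
        struct ggml_threadpool_params tpp       = ggml_threadpool_params_default(4);
        struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_default(8);

        struct ggml_threadpool * tp       = ggml_threadpool_new(&tpp);
        struct ggml_threadpool * tp_batch = nullptr;
        if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
            tp_batch = ggml_threadpool_new(&tpp_batch);
        }

        // passing nullptr as the batch pool makes llama reuse the first pool
        llama_attach_threadpool(ctx, tp, tp_batch);

        // ... llama_decode() / llama_encode() run on the attached pools ...

        llama_detach_threadpool(ctx);
        ggml_threadpool_free(tp_batch); // no-op when tp_batch is nullptr
        ggml_threadpool_free(tp);
    }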