|
| 1 | +#include "topk-moe.cuh" |
| 2 | + |
/*
    This kernel does the following:
    1. softmax over the logits per token [n_experts, n_tokens]
    2. argmax reduce over the top-k (n_experts_used) logits
    3. write weights + ids to global memory

    It is intended as a fusion of the softmax->top-k->get_rows pipeline for MoE models
*/
/*
 * Fused softmax + top-k routing kernel for MoE.
 *
 * Launch contract (see launch_topk_moe_cuda): one warp per token row, i.e.
 * blockDim.x == 32, blockDim.y == rows per block, gridDim.x covers n_rows.
 * n_experts is a compile-time power of two (dispatched by the launcher), so
 * each of the 32 lanes owns a contiguous slice of n_experts/32 experts that
 * stays entirely in registers.
 */
template <size_t n_experts>
__global__ void topk_moe_cuda(const float * logits,
                              float * weights,
                              int32_t * ids,
                              const int n_rows,
                              const int n_expert_used) {
    // One warp handles one row (token); blockDim.y rows are packed per block.
    const int row = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= n_rows) {
        return;
    }
    // Advance all pointers to this row. NOTE(review): ids is strided by
    // n_experts (not n_expert_used) — presumably to match the layout of the
    // un-fused argsort output tensor; confirm against the caller's shapes.
    logits += n_experts * row;
    ids += n_experts * row;
    weights += n_expert_used * row;

    // Registers per lane for its slice of the row (1 when n_experts <= 32).
    constexpr int experts_per_thread = (n_experts > 32) ? n_experts / 32 : 1;

    const int start_expert = threadIdx.x * experts_per_thread;
    const int end_expert = (threadIdx.x + 1) * experts_per_thread;
    float max_val = -INFINITY;

    // Phase 1: row-wide max for numerically stable softmax. Lanes whose slice
    // extends past n_experts pad with -INFINITY so they cannot win.
#pragma unroll
    for (int i = 0; i < experts_per_thread; i++) {
        const int expert = start_expert + i;
        const float val = (expert < n_experts) ? logits[expert] : -INFINITY;
        max_val = max(val, max_val);
    }

    max_val = warp_reduce_max(max_val);

    float wt[experts_per_thread];
    float tmp = 0.f;

    // Phase 2: exponentiate and accumulate the softmax denominator.
    // Padding slots evaluate expf(-inf) == 0, so they do not bias the sum.
#pragma unroll
    for (int i = 0; i < experts_per_thread; i++) {
        const int expert = start_expert + i;
        const float val = (expert < n_experts) ? logits[expert] : -INFINITY;
        wt[i] = expf(val - max_val);
        tmp += wt[i];
    }

    tmp = warp_reduce_sum(tmp);

    const float inv_sum = 1.0f / tmp;

    // Phase 3: normalize — wt[] now holds this lane's slice of the softmax.
#pragma unroll
    for (int i = 0; i < experts_per_thread; i++) {
        wt[i] = wt[i] * inv_sum;
    }

    //at this point, each thread holds a portion of softmax,
    //we do the argmax reduce over n_expert_used, each time marking
    //the expert weight as -inf to exclude from the next iteration

    for (int k = 0; k < n_expert_used; k++) {
        // Per-lane argmax over the lane's own slice (shadows the outer max_val).
        float max_val = wt[0];
        int max_expert = start_expert;

#pragma unroll
        for (int i = 1; i < experts_per_thread; i++) {
            const int expert = start_expert + i;
            if (wt[i] > max_val) {
                max_val = wt[i];
                max_expert = expert;
            }
        }

        // Warp-wide butterfly argmax via XOR shuffles: after log2(warpSize)
        // steps every lane holds the row-wide (max_val, max_expert) pair.
#pragma unroll
        for (int mask = warpSize / 2; mask > 0; mask /= 2) {
            const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, warpSize);
            const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, warpSize);
            if (val > max_val) {
                max_val = val;
                max_expert = expert;
            }
        }

        // Exactly one lane owns the winning expert: it retires the winner
        // (so it cannot be re-selected next iteration) and writes the output.
        if (max_expert >= start_expert && max_expert < end_expert) {
            wt[max_expert - start_expert] = -INFINITY;

            weights[k] = max_val;
            ids[k] = max_expert;
        }
    }
}
| 95 | + |
// Dispatches the fused top-k MoE kernel on the context's stream, selecting the
// compile-time expert count. Each block covers rows_per_block rows, one warp
// (32 lanes) per row. Only power-of-two expert counts up to 512 are supported;
// anything else is a hard failure.
static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
                                 const float * logits,
                                 float * weights,
                                 int32_t * ids,
                                 const int n_rows,
                                 const int n_expert,
                                 const int n_expert_used) {
    constexpr int rows_per_block = 4;
    const dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
    const dim3 block_dims(32, rows_per_block, 1);
    cudaStream_t stream = ctx.stream();

    // The kernel needs n_expert as a template constant, hence the dispatch
    // table; the macro keeps the ten instantiations in one place.
    switch (n_expert) {
#define TOPK_MOE_LAUNCH_CASE(NE)                                                                                   \
        case NE:                                                                                                   \
            topk_moe_cuda<NE><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);  \
            break;
        TOPK_MOE_LAUNCH_CASE(1)
        TOPK_MOE_LAUNCH_CASE(2)
        TOPK_MOE_LAUNCH_CASE(4)
        TOPK_MOE_LAUNCH_CASE(8)
        TOPK_MOE_LAUNCH_CASE(16)
        TOPK_MOE_LAUNCH_CASE(32)
        TOPK_MOE_LAUNCH_CASE(64)
        TOPK_MOE_LAUNCH_CASE(128)
        TOPK_MOE_LAUNCH_CASE(256)
        TOPK_MOE_LAUNCH_CASE(512)
#undef TOPK_MOE_LAUNCH_CASE
        default:
            GGML_ASSERT(false && "fatal error");
            break;
    }
}
| 144 | + |
// ggml op entry point for the fused softmax->top-k->get_rows MoE routing.
// Validates tensor types, unpacks device pointers and shapes, and forwards to
// the templated launcher. Fix: removed a dead `cudaStream_t stream` local that
// was fetched from ctx but never used (the launcher derives the stream itself).
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                           ggml_tensor * logits,
                           ggml_tensor * weights,
                           ggml_tensor * ids) {
    GGML_ASSERT(logits->type == GGML_TYPE_F32);
    GGML_ASSERT(weights->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type == GGML_TYPE_I32);

    // The fused op reads the raw logits from the softmax node's source tensor.
    const float * logits_d  = (const float *) logits->src[0]->data;
    float *       weights_d = (float *) weights->data;
    int32_t *     ids_d     = (int32_t *) ids->data;

    const int n_experts     = logits->ne[0];   // experts per token (row length)
    const int n_rows        = logits->ne[1];   // number of tokens
    const int n_expert_used = weights->ne[1];  // top-k experts kept per token

    launch_topk_moe_cuda(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
}
0 commit comments