
Commit e772b28

Use smem for final write
1 parent 240b2c1 · commit e772b28

3 files changed: 5 additions, 5 deletions

ggml/src/ggml-cuda/ggml-cuda.cu (0 additions, 1 deletion)

```diff
@@ -2953,7 +2953,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 }
 
                 if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
-
                     ggml_tensor * weights = cgraph->nodes[i+4];
                     ggml_tensor * selected_experts = cgraph->nodes[i+3];
                     ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
```

ggml/src/ggml-cuda/topk-moe.cu (2 additions, 2 deletions)

```diff
@@ -74,7 +74,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
     float wt_sum = 0.f;
 
     extern __shared__ float data_topk_shared[];
-    float * wt_shared_ptr = data_topk_shared + row * n_expert_used;
+    float * wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
 
     for (int k = 0; k < n_expert_used; k++) {
         float max_val = wt[0];
```
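The indexing fix above matters because `data_topk_shared` is dynamic shared memory, which is allocated per block: with the `(WARP_SIZE, 4)` block shape implied by `__launch_bounds__(4 * WARP_SIZE, 1)`, the buffer only holds four `n_expert_used`-sized slices, so the offset has to be the warp's index within the block (`threadIdx.y`), not the global row. A minimal sketch of the staging pattern, with illustrative names rather than the real kernel:

```cuda
// Sketch only: one warp per row, four warps per block, dynamic shared memory
// sized for the block (4 * n_expert_used floats). Names are illustrative.
#define WARP_SIZE 32

__global__ void staged_row_write(float * dst, const int n_expert_used) {
    extern __shared__ float smem[];

    const int row = blockIdx.x * blockDim.y + threadIdx.y;  // global row this warp owns

    // Correct: offset by the warp's index within the block.
    float * slice = smem + threadIdx.y * n_expert_used;
    // Offsetting by `row` instead would walk past the block's shared-memory
    // allocation for every block with blockIdx.x > 0.

    // Stage: e.g. one value per iteration of a selection loop (lane 0 writes).
    for (int k = 0; k < n_expert_used; k++) {
        if (threadIdx.x == 0) {
            slice[k] = (float) k;  // placeholder for the k-th selected weight
        }
    }
    __syncwarp();

    // Final write: all lanes of the warp flush the staged slice to global memory.
    for (int k = threadIdx.x; k < n_expert_used; k += WARP_SIZE) {
        dst[row * n_expert_used + k] = slice[k];
    }
}
```

Such a kernel would be launched as `staged_row_write<<<n_rows / 4, dim3(WARP_SIZE, 4), 4 * n_expert_used * sizeof(float)>>>(dst, n_expert_used)`, assuming `n_rows` is a multiple of 4.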
```diff
@@ -83,7 +83,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
 #pragma unroll
         for (int i = 1; i < experts_per_thread; i++) {
             const int expert = threadIdx.x + i * WARP_SIZE;
-            if (expert < n_experts && wt[i] > max_val) {
+            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
                 max_val = wt[i];
                 max_expert = expert;
             }
```
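The second hunk rewrites the bounds check so it can disappear at compile time: assuming `n_experts` is a compile-time constant here (the fully unrolled loop over `experts_per_thread` suggests it is), the compiler evaluates `n_experts % WARP_SIZE == 0`, and for expert counts that are multiples of the warp size the whole guard short-circuits to true and no per-iteration branch is emitted; for other counts the runtime `expert < n_experts` check still protects the tail. A hedged sketch of the pattern, using a hypothetical helper rather than the actual kernel:

```cuda
// Sketch of the guard pattern, assuming the expert count is a template
// parameter (hypothetical helper, not the actual topk_moe_cuda kernel).
#define WARP_SIZE 32

template <int n_experts>
__device__ float lane_max(const float * wt) {
    constexpr int experts_per_thread = (n_experts + WARP_SIZE - 1) / WARP_SIZE;

    float max_val = wt[0];
#pragma unroll
    for (int i = 1; i < experts_per_thread; i++) {
        const int expert = threadIdx.x + i * WARP_SIZE;
        // If n_experts % WARP_SIZE == 0, the left operand is a compile-time
        // constant `true`, so the bounds check folds away in every unrolled
        // iteration; otherwise it still guards the final partial iteration.
        if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
            max_val = wt[i];
        }
    }
    return max_val;
}
```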

src/llama-graph.cpp (3 additions, 2 deletions)

```diff
@@ -929,8 +929,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
-    //call early so that softmax->topk->get_rows can be fused
-    ggml_build_forward_expand(gf, weights);
 
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
@@ -955,6 +953,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }
 
+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
```
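Moving `ggml_build_forward_expand(gf, weights)` below the softmax-weight scaling changes which nodes end up adjacent in the expanded graph: the CUDA backend's topk-moe path (see `ggml_cuda_can_fuse` in the first diff) matches a run of consecutive node ops starting at node `i` and picks up `cgraph->nodes[i+3]` / `nodes[i+4]` as the selected experts and weights, so expanding the graph before the scaling was applied would leave the scaled weights outside the run the matcher inspects. A rough sketch of that kind of sequence match, under the assumption that the internal `ggml_cgraph` layout is visible as it is inside the backend (this is not the real `ggml_cuda_can_fuse`):

```cuda
// Hypothetical helper, for illustration only: check that the ops of the next
// few graph nodes match an expected pattern before dispatching a fused kernel.
#include <vector>

#include "ggml.h"        // ggml_op
#include "ggml-impl.h"   // struct ggml_cgraph (internal layout, visible to backends)

static bool ops_match(const ggml_cgraph * cgraph, int i, const std::vector<ggml_op> & expected) {
    if (i + (int) expected.size() > cgraph->n_nodes) {
        return false;                                    // pattern would run past the graph
    }
    for (size_t k = 0; k < expected.size(); ++k) {
        if (cgraph->nodes[i + (int) k]->op != expected[k]) {
            return false;                                // sequence broken -> no fusion here
        }
    }
    return true;
}
```

The moved comment ("call early so that topk-moe can be used") reflects this: the expand call still happens early relative to the rest of the FFN, but only after every op the fused path is expected to cover has been recorded.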
