Use shared memory (smem) for final write

am17an · am17an · commit 4b2d2b9f88b0 · 2025-09-25T10:26:06.000+08:00
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
@@ -74,7 +74,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
     float wt_sum = 0.f;
 
     extern __shared__ float data_topk_shared[];
-    float *                 wt_shared_ptr = data_topk_shared + row * n_expert_used;
+    float * wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
 
     for (int k = 0; k < n_expert_used; k++) {
         float max_val    = wt[0];