@@ -8538,22 +8538,24 @@ static struct ggml_tensor * llm_build_moe_ffn(
85388538 ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
85398539 cb(gate, "ffn_moe_gate", il);
85408540
8541- switch (type_op) {
8542- case LLM_FFN_SILU:
8543- {
8544- gate = ggml_silu(ctx, gate);
8545- cb(gate, "ffn_moe_silu", il);
8546- } break;
8547- case LLM_FFN_GELU:
8548- {
8549- gate = ggml_gelu(ctx, gate);
8550- cb(gate, "ffn_moe_gelu", il);
8551- } break;
8552- default:
8553- GGML_ABORT("fatal error");
8554- }
8555-
8556- ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
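8541+ // Intended as a fused replacement for the commented-out code below (activation on gate, then multiply with up). Not strictly equivalent: any type_op other than LLM_FFN_SILU is treated as GELU instead of hitting GGML_ABORT, and the "ffn_moe_silu"/"ffn_moe_gelu" callbacks are no longer issued.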
8542+ ggml_tensor * par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
8543+
8544+ //switch (type_op) {
8545+ // case LLM_FFN_SILU:
8546+ // {
8547+ // gate = ggml_silu(ctx, gate);
8548+ // cb(gate, "ffn_moe_silu", il);
8549+ // } break;
8550+ // case LLM_FFN_GELU:
8551+ // {
8552+ // gate = ggml_gelu(ctx, gate);
8553+ // cb(gate, "ffn_moe_gelu", il);
8554+ // } break;
8555+ // default:
8556+ // GGML_ABORT("fatal error");
8557+ //}
8558+ //ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
85578559 cb(par, "ffn_moe_gate_par", il);
85588560
85598561 ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
0 commit comments