Skip to content

Commit ba2470c

Browse files
authored
feat: add finalize_moe_allreduce from trtllm (#1159)
<!-- .github/pull_request_template.md --> ## 📌 Description - add finalize_moe_allreduce from trtllm ## 🔍 Related Issues NVIDIA/TensorRT-LLM#4756 ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent 15b3e65 commit ba2470c

File tree

5 files changed

+687
-4
lines changed

5 files changed

+687
-4
lines changed

csrc/trtllm_moe_allreduce_fusion.cu

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,57 @@ void trtllm_moe_allreduce_fusion(
8181
});
8282
}
8383

84+
// Fused finalize-MoE allreduce + residual RMSNorm entry point.
//
// Fills a MoeFinalizeAllReduceFusionParams struct from the tensor arguments and
// launches the fused kernel via moefinalize_allreduce_fusion_op. `norm_out` and
// `residual_out` are written in place; no quantized outputs are produced here.
//
// Throws (via TORCH_CHECK) if `expanded_idx_to_permuted_idx` is not int32 or if
// the kernel launch reports a CUDA error.
void trtllm_moe_finalize_allreduce_fusion(
    at::Tensor const& allreduce_in, at::Tensor const& residual_in, at::Tensor const& norm_weight,
    at::Tensor const& expanded_idx_to_permuted_idx, at::Tensor& norm_out, at::Tensor& residual_out,
    bool launch_with_pdl, at::Tensor& workspace, int64_t const world_rank, int64_t const world_size,
    double const eps, std::optional<at::Tensor> const& shared_expert_output,
    std::optional<at::Tensor> const& expert_scale_factor) {
  // Dispatch on the residual dtype; c_type is the element type the kernel sees.
  DISPATCH_FLOATING_TYPES_FOR_ALLREDUCE(residual_in.scalar_type(), c_type, [&] {
    MoeFinalizeAllReduceFusionParams<c_type> params;

    int const hidden_size = residual_in.size(-1);
    // Last dim of the index tensor is the number of experts routed per token.
    int const experts_per_token = expanded_idx_to_permuted_idx.size(-1);

    // This path emits no quantized output or scale tensors.
    params.quant_out = nullptr;
    params.scale_out = nullptr;

    params.nranks = static_cast<int>(world_size);
    params.rank = static_cast<int>(world_rank);
    // size: num_token * hidden_dim
    params.size = residual_in.numel();
    params.hidden_dim = hidden_size;

    // workspace: AR scratch space, interpreted as an array of raw pointers.
    params.workspace = reinterpret_cast<void**>(workspace.mutable_data_ptr());
    params.rms_gamma = norm_weight.data_ptr();
    params.rms_eps = static_cast<float>(eps);
    params.residual_in = residual_in.data_ptr();
    params.stream = at::cuda::getCurrentCUDAStream(norm_weight.get_device());

    // MoE-reduction specific inputs; optional tensors map to nullptr when absent.
    params.top_k = experts_per_token;
    params.allreduce_in = allreduce_in.data_ptr();
    params.expert_scale_factor =
        expert_scale_factor ? expert_scale_factor->data_ptr() : nullptr;
    TORCH_CHECK(expanded_idx_to_permuted_idx.scalar_type() == at::ScalarType::Int,
                "expanded_idx_to_permuted_idx must be int32");
    params.expanded_idx_to_permuted_idx =
        static_cast<int32_t*>(expanded_idx_to_permuted_idx.data_ptr());
    params.shared_expert_output =
        shared_expert_output ? shared_expert_output->data_ptr() : nullptr;

    // In-place output tensors.
    params.norm_out = norm_out.mutable_data_ptr();
    params.residual_out = residual_out.mutable_data_ptr();

    auto status = moefinalize_allreduce_fusion_op(params, launch_with_pdl);
    TORCH_CHECK(status == cudaSuccess, "moefinalize_allreduce_fusion_op failed with error code ",
                cudaGetErrorString(status));
  });
}
84134
// Register both MoE allreduce fusion entry points on this torch extension library.
TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
  m.def("trtllm_moe_allreduce_fusion", &trtllm_moe_allreduce_fusion);
  m.def("trtllm_moe_finalize_allreduce_fusion", &trtllm_moe_finalize_allreduce_fusion);
}

flashinfer/comm.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class AllReduceFusionOp:
6666
RESIDUAL_RMS_NORM_OUT_QUANT_FP8 = 6
6767
RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 = 7
6868
MOE_ALLREDUCE_RESIDUAL_RMS_NORM = 8
69+
MOE_FINALIZE_ALLREDUCE_RESIDUAL_RMS_NORM = 9
6970

7071

7172
class AllReduceFusionPattern:
@@ -599,12 +600,48 @@ def trtllm_moe_allreduce_fusion(
599600
scale_out,
600601
)
601602

603+
@register_custom_op(
    "flashinfer::trtllm_moe_finalize_allreduce_fusion",
    mutates_args=["residual_out", "norm_out"],
)
def trtllm_moe_finalize_allreduce_fusion(
    allreduce_in: torch.Tensor,
    residual_in: torch.Tensor,
    norm_weight: torch.Tensor,
    expanded_idx_to_permuted_idx: torch.Tensor,
    norm_out: torch.Tensor,
    residual_out: torch.Tensor,
    launch_with_pdl: bool,
    workspace: torch.Tensor,
    world_rank: int,
    world_size: int,
    eps: float,
    shared_expert_output: Optional[torch.Tensor],
    expert_scale_factor: Optional[torch.Tensor],
) -> None:
    """Custom-op shim over the compiled finalize-MoE allreduce fusion kernel.

    Forwards all arguments positionally to the JIT-compiled extension;
    ``norm_out`` and ``residual_out`` are mutated in place.
    """
    op = module.trtllm_moe_finalize_allreduce_fusion
    op(
        allreduce_in,
        residual_in,
        norm_weight,
        expanded_idx_to_permuted_idx,
        norm_out,
        residual_out,
        launch_with_pdl,
        workspace,
        world_rank,
        world_size,
        eps,
        shared_expert_output,
        expert_scale_factor,
    )
602638
return SimpleNamespace(
603639
trtllm_lamport_initialize=trtllm_lamport_initialize,
604640
trtllm_lamport_initialize_all=trtllm_lamport_initialize_all,
605641
trtllm_custom_all_reduce=trtllm_custom_all_reduce,
606642
trtllm_allreduce_fusion=trtllm_allreduce_fusion,
607643
trtllm_moe_allreduce_fusion=trtllm_moe_allreduce_fusion,
644+
trtllm_moe_finalize_allreduce_fusion=trtllm_moe_finalize_allreduce_fusion,
608645
)
609646

610647

@@ -1088,3 +1125,35 @@ def trtllm_moe_allreduce_fusion(
10881125
quant_out=quant_out,
10891126
scale_out=scale_out,
10901127
)
1128+
1129+
1130+
def trtllm_moe_finalize_allreduce_fusion(
    allreduce_in: torch.Tensor,
    residual_in: torch.Tensor,
    norm_weight: torch.Tensor,
    expanded_idx_to_permuted_idx: torch.Tensor,
    norm_out: torch.Tensor,
    residual_out: torch.Tensor,
    workspace_ptrs: torch.Tensor,
    launch_with_pdl: bool,
    world_rank: int,
    world_size: int,
    eps: float,
    shared_expert_output: Optional[torch.Tensor],
    expert_scale_factor: Optional[torch.Tensor],
) -> None:
    """Fused finalize-MoE allreduce + residual RMSNorm (TensorRT-LLM kernels).

    Thin public wrapper that delegates to the registered
    ``trtllm_moe_finalize_allreduce_fusion`` custom op from the trtllm comm
    module. ``norm_out`` and ``residual_out`` are written in place.

    Args:
        allreduce_in: Input tensor to be all-reduced.
        residual_in: Residual input; its last dim is the hidden dimension and
            its dtype selects the kernel instantiation.
        norm_weight: RMSNorm gamma weights.
        expanded_idx_to_permuted_idx: int32 index map; last dim is top-k.
        norm_out: Output tensor for the normalized result (mutated).
        residual_out: Output tensor for the updated residual (mutated).
        workspace_ptrs: Allreduce scratch-space pointer tensor.
        launch_with_pdl: Launch with programmatic dependent launch enabled.
        world_rank: Rank of this process in the communicator.
        world_size: Number of ranks participating in the allreduce.
        eps: RMSNorm epsilon.
        shared_expert_output: Optional shared-expert output tensor.
        expert_scale_factor: Optional per-expert scale factors.
    """
    comm_module = get_trtllm_comm_module()
    comm_module.trtllm_moe_finalize_allreduce_fusion(
        allreduce_in=allreduce_in,
        residual_in=residual_in,
        norm_weight=norm_weight,
        expanded_idx_to_permuted_idx=expanded_idx_to_permuted_idx,
        norm_out=norm_out,
        residual_out=residual_out,
        workspace=workspace_ptrs,
        launch_with_pdl=launch_with_pdl,
        world_rank=world_rank,
        world_size=world_size,
        eps=eps,
        shared_expert_output=shared_expert_output,
        expert_scale_factor=expert_scale_factor,
    )

include/flashinfer/comm/trtllm_allreduce.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ enum class AllReduceFusionOp : int8_t {
7474
RESIDUAL_RMS_NORM_OUT_QUANT_FP8 = 6,
7575
RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 = 7,
7676
MOE_ALLREDUCE_RESIDUAL_RMS_NORM = 8,
77+
MOE_FINALIZE_ALLREDUCE_RESIDUAL_RMS_NORM = 9,
7778
};
7879

7980
template <typename T>

0 commit comments

Comments (0)