flashinfer-ai
diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2025 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cutlass_fused_moe_kernels.cuh"
+#include "moe_kernels.h"
+
+namespace tensorrt_llm::kernels {
+// Explicit instantiation definitions of CutlassMoeFCRunner.
+//
+// The class template itself is declared outside this file; the arguments below are passed
+// positionally and any argument not spelled out falls back to the template's own default.
+// Each ENABLE_* macro gates the instantiations that mention the corresponding data type.
+
+// ---- Unconditional: float / half first argument ----------------------------------------------
+template class CutlassMoeFCRunner<float, float>;
+template class CutlassMoeFCRunner<half, half>;
+template class CutlassMoeFCRunner<half, uint8_t>;
+template class CutlassMoeFCRunner<half, cutlass::uint4b_t>;
+
+// ---- Requires ENABLE_BF16 --------------------------------------------------------------------
+#if defined(ENABLE_BF16)
+template class CutlassMoeFCRunner<__nv_bfloat16, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_bfloat16, uint8_t>;
+template class CutlassMoeFCRunner<__nv_bfloat16, cutlass::uint4b_t>;
+#endif  // ENABLE_BF16
+
+// ---- Requires ENABLE_FP8 ---------------------------------------------------------------------
+// Note: the two-argument form CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3> is not
+// instantiated here (it was left commented out; the reason is not recorded in this file).
+#if defined(ENABLE_FP8)
+template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, half>;
+template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half, half>;
+#endif  // ENABLE_FP8
+
+// ---- Requires both ENABLE_FP8 and ENABLE_BF16 ------------------------------------------------
+#if defined(ENABLE_FP8) && defined(ENABLE_BF16)
+template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16>;
+#endif  // ENABLE_FP8 && ENABLE_BF16
+
+// ---- Requires ENABLE_FP4 ---------------------------------------------------------------------
+#if defined(ENABLE_FP4)
+template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>;
+template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half, half>;
+#endif  // ENABLE_FP4
+
+// ---- Requires both ENABLE_FP4 and ENABLE_BF16 ------------------------------------------------
+#if defined(ENABLE_FP4) && defined(ENABLE_BF16)
+template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16, __nv_bfloat16>;
+#endif  // ENABLE_FP4 && ENABLE_BF16
+
+}  // namespace tensorrt_llm::kernels