openvinotoolkit · akladiev · Dec 2, 2025 · Nov 28, 2025
@@ -187,8 +187,5 @@ attach_moe_mask_gen_reshape_impl::attach_moe_mask_gen_reshape_impl() {
 }  // namespace cpu
 }  // namespace cldnn
 
-BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::moe_mask_gen_impl)
-BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen)
-
 BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::moe_mask_gen_reshape_impl)
 BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen_reshape)
@@ -0,0 +1,83 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "moe_mask_gen.hpp"
+
+#include "../common_utils/dispatch_utils.hpp"
+#include "../common_utils/jitter.hpp"
+#include "../primitive_ocl_base.hpp"
+#include "../utils/kernel_generator.hpp"
+#include "intel_gpu/primitives/moe_mask_gen.hpp"
+
+namespace ov::intel_gpu::ocl {
+namespace {
+
+class MoeMaskGenRefGenerator : public KernelGenerator {
+public:
+    MoeMaskGenRefGenerator() : KernelGenerator("moe_mask_gen") {}
+
+protected:
+    [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override {
+        auto jit = KernelGenerator::get_jit_constants(params);
+
+        auto prim = params.typed_desc<moe_mask_gen>();
+        jit.make("NUM_EXPERTS_PER_TOKEN", prim->num_experts_per_token);
+
+        return jit;
+    }
+
+    Arguments get_arguments_desc(const RuntimeParams& params) const override {
+        Arguments args;
+        if (params.is_dynamic()) {
+            args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
+        }
+
+        args.push_back({ArgumentDescriptor::Types::INPUT, 0});
+
+        const uint32_t num_of_outputs = 5;
+        for (uint32_t i = 0; i < num_of_outputs; i++) {
+            args.push_back({ArgumentDescriptor::Types::OUTPUT, i});
+        }
+
+        return args;
+    }
+
+    [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override {
+        return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {
+            auto& wgs = kd.params.workGroups;
+            if (!params.is_dynamic()) {
+                auto num_total_experts = static_cast<size_t>(params.typed_desc<moe_mask_gen>()->num_total_experts);
+                wgs.global = {num_total_experts, 1, 1};
+                wgs.local = {num_total_experts, 1, 1};
+            }
+        }};
+    }
+};
+
+class MoeMaskGenRefImpl : public PrimitiveImplOCL {
+public:
+    DECLARE_OBJECT_TYPE_SERIALIZATION(ov::intel_gpu::ocl::MoeMaskGenRefImpl)
+
+    Stage::Ptr moe_mask_gen = make_stage<MoeMaskGenRefGenerator>();
+
+    MoeMaskGenRefImpl() : PrimitiveImplOCL(MoeMaskGenRef::get_type_info_static()) {}
+    MoeMaskGenRefImpl(const program_node& node, const RuntimeParams& params) : MoeMaskGenRefImpl() {
+        add_stage(moe_mask_gen, params);
+    }
+
+    [[nodiscard]] std::unique_ptr<primitive_impl> clone() const override {
+        return make_deep_copy<MoeMaskGenRefImpl>(this);
+    }
+};
+
+}  // namespace
+
+std::unique_ptr<primitive_impl> MoeMaskGenRef::create_impl(const program_node& node, const RuntimeParams& params) const {
+    assert(node.is_type<moe_mask_gen>());
+    return std::make_unique<MoeMaskGenRefImpl>(node, params);
+}
+
+}  // namespace ov::intel_gpu::ocl
+
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen)
+BIND_BINARY_BUFFER_WITH_TYPE(ov::intel_gpu::ocl::MoeMaskGenRefImpl)
@@ -0,0 +1,47 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "program_node.h"
+#include "registry/implementation_manager.hpp"
+
+using namespace cldnn;  // TODO: Remove once namespaces are aligned
+
+namespace ov::intel_gpu::ocl {
+
+struct MoeMaskGenRef : public ImplementationManager {
+    OV_GPU_PRIMITIVE_IMPL("ocl::moe_mask_gen::ref")
+    explicit MoeMaskGenRef(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, std::move(vf)) {}
+    [[nodiscard]] std::unique_ptr<primitive_impl> create_impl(const program_node& node, const RuntimeParams& params) const override;
+    [[nodiscard]] bool validate_impl(const program_node& node) const override {
+        static constexpr std::array supported_fmts = {
+            format::bfyx,
+        };
+
+        static constexpr std::array supported_types = {
+            ov::element::f32,
+            ov::element::i32,
+            ov::element::i64,
+        };
+
+        const auto& in0_layout = node.get_input_layout(0);
+        const auto& out_layout = node.get_output_layout(0);
+
+        if (!one_of(in0_layout.format, supported_fmts) || !one_of(out_layout.format, supported_fmts)) {
+            return false;
+        }
+
+        if (!one_of(in0_layout.data_type, supported_types) || !one_of(out_layout.data_type, supported_types)) {
+            return false;
+        }
+
+        return true;
+    }
+};
+
+}  // namespace ov::intel_gpu::ocl
@@ -0,0 +1,53 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/common.cl"
+
+KERNEL(moe_mask_gen)(
+    OPTIONAL_SHAPE_INFO_ARG
+    const __global INPUT0_TYPE* topk_idx,
+    __global OUTPUT_TYPE* tokens_per_expert,
+    __global OUTPUT1_TYPE* experts_info_start_idx,
+    __global OUTPUT2_TYPE* experts_id,
+    __global OUTPUT3_TYPE* tokens_lens_per_expert,
+    __global OUTPUT4_TYPE* num_actual_used_experts
+)
+{
+    const size_t expert_id = get_local_id(0);
+    int num_tokens = INPUT0_BATCH_NUM;
+
+    int num_tokens_per_curr_expert = 0;
+    for (int i = 0; i < num_tokens * NUM_EXPERTS_PER_TOKEN; ++i) {
+        if (topk_idx[i] == expert_id) {
+            num_tokens_per_curr_expert += 1;
+        }
+    }
+    int is_used = (num_tokens_per_curr_expert > 0) ? 1 : 0;
+
+    int tokens_per_expert_iter = work_group_scan_exclusive_add(num_tokens_per_curr_expert);
+    int experts_id_iter = work_group_scan_exclusive_add(is_used);
+
+    if ((expert_id + 1) == get_local_size(0)) {
+        num_actual_used_experts[0] = experts_id_iter + is_used;
+    }
+
+    if (num_tokens_per_curr_expert == 0) {
+        return;
+    }
+
+    experts_info_start_idx[experts_id_iter] = tokens_per_expert_iter;
+    experts_id[experts_id_iter] = expert_id;
+    tokens_lens_per_expert[experts_id_iter] = num_tokens_per_curr_expert;
+
+    int token_idx = 0;
+    for (int t = 0; t < num_tokens; ++t) {
+        for (int e = 0; e < NUM_EXPERTS_PER_TOKEN; ++e) {
+            if (topk_idx[token_idx] == expert_id) {
+                tokens_per_expert[tokens_per_expert_iter] = t;
+                tokens_per_expert_iter += 1;
+            }
+            token_idx += 1;
+        }
+    }
+}
@@ -6,14 +6,18 @@
 #include "intel_gpu/primitives/moe_mask_gen.hpp"
 #include "primitive_inst.h"
 
+#if OV_GPU_WITH_OCL
+    #include "impls/ocl_v2/moe/moe_mask_gen.hpp"
+#endif
+
 namespace ov::intel_gpu {
 
 using namespace cldnn;
 
 const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<moe_mask_gen>::get_implementations() {
     static const std::vector<std::shared_ptr<ImplementationManager>> impls = {
-        OV_GPU_GET_INSTANCE_CPU(moe_mask_gen, shape_types::static_shape)
-        OV_GPU_GET_INSTANCE_CPU(moe_mask_gen, shape_types::dynamic_shape)
+        OV_GPU_CREATE_INSTANCE_OCL(ocl::MoeMaskGenRef, shape_types::static_shape)
+        OV_GPU_CREATE_INSTANCE_OCL(ocl::MoeMaskGenRef, shape_types::dynamic_shape)
     };
 
     return impls;