Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions src/plugins/intel_gpu/src/graph/impls/cpu/moe_mask_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,5 @@ attach_moe_mask_gen_reshape_impl::attach_moe_mask_gen_reshape_impl() {
} // namespace cpu
} // namespace cldnn

BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::moe_mask_gen_impl)
BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen)

BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::moe_mask_gen_reshape_impl)
BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen_reshape)
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "moe_mask_gen.hpp"

#include "../common_utils/dispatch_utils.hpp"
#include "../common_utils/jitter.hpp"
#include "../primitive_ocl_base.hpp"
#include "../utils/kernel_generator.hpp"
#include "intel_gpu/primitives/moe_mask_gen.hpp"

namespace ov::intel_gpu::ocl {
namespace {

class MoeMaskGenRefGenerator : public KernelGenerator {
public:
MoeMaskGenRefGenerator() : KernelGenerator("moe_mask_gen") {}

protected:
[[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override {
auto jit = KernelGenerator::get_jit_constants(params);

auto prim = params.typed_desc<moe_mask_gen>();
jit.make("NUM_EXPERTS_PER_TOKEN", prim->num_experts_per_token);

return jit;
}

Arguments get_arguments_desc(const RuntimeParams& params) const override {
Arguments args;
if (params.is_dynamic()) {
args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
}

args.push_back({ArgumentDescriptor::Types::INPUT, 0});

const uint32_t num_of_outputs = 5;
for (uint32_t i = 0; i < num_of_outputs; i++) {
args.push_back({ArgumentDescriptor::Types::OUTPUT, i});
}

return args;
}

[[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override {
return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {
auto& wgs = kd.params.workGroups;
if (!params.is_dynamic()) {
auto num_total_experts = static_cast<size_t>(params.typed_desc<moe_mask_gen>()->num_total_experts);
wgs.global = {num_total_experts, 1, 1};
wgs.local = {num_total_experts, 1, 1};
}
}};
}
};

class MoeMaskGenRefImpl : public PrimitiveImplOCL {
public:
DECLARE_OBJECT_TYPE_SERIALIZATION(ov::intel_gpu::ocl::MoeMaskGenRefImpl)

Stage::Ptr moe_mask_gen = make_stage<MoeMaskGenRefGenerator>();

MoeMaskGenRefImpl() : PrimitiveImplOCL(MoeMaskGenRef::get_type_info_static()) {}
MoeMaskGenRefImpl(const program_node& node, const RuntimeParams& params) : MoeMaskGenRefImpl() {
add_stage(moe_mask_gen, params);
}

[[nodiscard]] std::unique_ptr<primitive_impl> clone() const override {
return make_deep_copy<MoeMaskGenRefImpl>(this);
}
};

} // namespace

std::unique_ptr<primitive_impl> MoeMaskGenRef::create_impl(const program_node& node, const RuntimeParams& params) const {
assert(node.is_type<moe_mask_gen>());
return std::make_unique<MoeMaskGenRefImpl>(node, params);
}

} // namespace ov::intel_gpu::ocl

BIND_BINARY_BUFFER_WITH_TYPE(cldnn::moe_mask_gen)
BIND_BINARY_BUFFER_WITH_TYPE(ov::intel_gpu::ocl::MoeMaskGenRefImpl)
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <utility>

#include "program_node.h"
#include "registry/implementation_manager.hpp"

using namespace cldnn; // TODO: Remove once namespaces are aligned

namespace ov::intel_gpu::ocl {

struct MoeMaskGenRef : public ImplementationManager {
OV_GPU_PRIMITIVE_IMPL("ocl::moe_mask_gen::ref")
explicit MoeMaskGenRef(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, std::move(vf)) {}
[[nodiscard]] std::unique_ptr<primitive_impl> create_impl(const program_node& node, const RuntimeParams& params) const override;
[[nodiscard]] bool validate_impl(const program_node& node) const override {
static constexpr std::array supported_fmts = {
format::bfyx,
};

static constexpr std::array supported_types = {
ov::element::f32,
ov::element::i32,
ov::element::i64,
};

const auto& in0_layout = node.get_input_layout(0);
const auto& out_layout = node.get_output_layout(0);

if (!one_of(in0_layout.format, supported_fmts) || !one_of(out_layout.format, supported_fmts)) {
return false;
}

if (!one_of(in0_layout.data_type, supported_types) || !one_of(out_layout.data_type, supported_types)) {
return false;
}

return true;
}
};

} // namespace ov::intel_gpu::ocl
53 changes: 53 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_mask_gen.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"

KERNEL(moe_mask_gen)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* topk_idx,
__global OUTPUT_TYPE* tokens_per_expert,
__global OUTPUT1_TYPE* experts_info_start_idx,
__global OUTPUT2_TYPE* experts_id,
__global OUTPUT3_TYPE* tokens_lens_per_expert,
__global OUTPUT4_TYPE* num_actual_used_experts
)
{
const size_t expert_id = get_local_id(0);
int num_tokens = INPUT0_BATCH_NUM;

int num_tokens_per_curr_expert = 0;
for (int i = 0; i < num_tokens * NUM_EXPERTS_PER_TOKEN; ++i) {
if (topk_idx[i] == expert_id) {
num_tokens_per_curr_expert += 1;
}
}
int is_used = (num_tokens_per_curr_expert > 0) ? 1 : 0;

int tokens_per_expert_iter = work_group_scan_exclusive_add(num_tokens_per_curr_expert);
int experts_id_iter = work_group_scan_exclusive_add(is_used);

if ((expert_id + 1) == get_local_size(0)) {
num_actual_used_experts[0] = experts_id_iter + is_used;
}

if (num_tokens_per_curr_expert == 0) {
return;
}

experts_info_start_idx[experts_id_iter] = tokens_per_expert_iter;
experts_id[experts_id_iter] = expert_id;
tokens_lens_per_expert[experts_id_iter] = num_tokens_per_curr_expert;

int token_idx = 0;
for (int t = 0; t < num_tokens; ++t) {
for (int e = 0; e < NUM_EXPERTS_PER_TOKEN; ++e) {
if (topk_idx[token_idx] == expert_id) {
tokens_per_expert[tokens_per_expert_iter] = t;
tokens_per_expert_iter += 1;
}
token_idx += 1;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
#include "intel_gpu/primitives/moe_mask_gen.hpp"
#include "primitive_inst.h"

#if OV_GPU_WITH_OCL
#include "impls/ocl_v2/moe/moe_mask_gen.hpp"
#endif

namespace ov::intel_gpu {

using namespace cldnn;

const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<moe_mask_gen>::get_implementations() {
static const std::vector<std::shared_ptr<ImplementationManager>> impls = {
OV_GPU_GET_INSTANCE_CPU(moe_mask_gen, shape_types::static_shape)
OV_GPU_GET_INSTANCE_CPU(moe_mask_gen, shape_types::dynamic_shape)
OV_GPU_CREATE_INSTANCE_OCL(ocl::MoeMaskGenRef, shape_types::static_shape)
OV_GPU_CREATE_INSTANCE_OCL(ocl::MoeMaskGenRef, shape_types::dynamic_shape)
};

return impls;
Expand Down
Loading