From 99ad9de883a0a23ce8184115ef6288d25b96ad35 Mon Sep 17 00:00:00 2001 From: River Date: Thu, 20 Nov 2025 14:41:45 +0800 Subject: [PATCH 01/20] [GPU][MoE] Optimize moe_3gemm primitive implement with micro_gemm kernel --- .../graph/impls/ocl_v2/moe/moe_3gemm_base.hpp | 41 ++ .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 350 ++++++++++++++ .../impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp | 83 ++++ .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 447 +++++++++++++++++- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.hpp | 26 +- .../impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 33 +- .../src/graph/impls/ocl_v2/moe_gemm.cl | 4 + .../impls/ocl_v2/moe_scatter_reduction_opt.cl | 7 + .../convert_moe_to_compressed.cpp | 1 + 9 files changed, 951 insertions(+), 41 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp new file mode 100644 index 00000000000000..ea9de9677947ec --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "moe_gemm_base.hpp" + +// #define ENABLE_ONEDNN_FOR_GPU + +namespace ov::intel_gpu::ocl { + +// mlp_gate: 0 +// mlp_up: 1 +// mlp_down: 2 + +enum class MoE3GemmMicroKernelType : uint8_t { MLP_GATE = 0, MLP_UP = 1, MLP_DOWN = 2 }; + +enum class MOE3GemmInputIndex : uint8_t { + HIDDEN_STATES = 0, + ROUTING_WEIGHTS = 1, + WEIGHT_0 = 2, + SCALE_0 = 3, + ZP_0 = 4, + WEIGHT_1 = 5, + SCALE_1 = 6, + ZP_1 = 7, + WEIGHT_2 = 8, + SCALE_2 = 9, + ZP_2 = 10 +}; + +struct moe_3gemm_config { + int32_t weight_group_size = -1; + 
bool has_batch_dim = false; // 0 - pa, 1 - non-pa +}; + +} // namespace ov::intel_gpu::ocl \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp new file mode 100644 index 00000000000000..252612601ec4e1 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -0,0 +1,350 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef ENABLE_ONEDNN_FOR_GPU +// clang-format off +// Put this file at first to avoid incorrect header files includes order. +// For example, intel_gpu/runtime/utils.hpp will causes compiling error in hash +#include "moe_3gemm_gen_micro.hpp" + +#include "intel_gpu/graph/kernel_impl_params.hpp" +// #include "intel_gpu/primitives/moe_gemm.hpp" +#include "ocl_v2/utils/jitter.hpp" +// #include "moe_gemm_inst.h" +#include "../utils/kernel_generator.hpp" + +// clang-format on +namespace ov::intel_gpu::ocl { + +static size_t get_subgroup_size(gpu_arch arch) { + switch (arch) { + case gpu_arch::gen9: + case gpu_arch::gen11: + case gpu_arch::xe_lp: + case gpu_arch::xe_hp: + case gpu_arch::xe_hpg: + return 8; + case gpu_arch::xe_hpc: + case gpu_arch::xe2: + case gpu_arch::xe3: + return 16; + default: + return 0; + } +} + +JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const { + const auto& device_info = params.get_device_info(); + auto jit = make_base_jit_constants(params); + jit.make("SUBGROUP_SIZE", get_subgroup_size(device_info.arch)); + jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // output + jit.make("INPUT0_TYPE", to_ocl_type(data_types::f16)); // input: f16 + jit.make("INPUT1_TYPE", to_ocl_type(data_types::u8)); // weight: u4 + jit.make("INPUT2_TYPE", to_ocl_type(data_types::i32)); // experts_ids: i32 + jit.make("INPUT3_TYPE", 
to_ocl_type(data_types::i32)); // input_offset_per_expert: i32 + jit.make("INPUT4_TYPE", to_ocl_type(data_types::i32)); // n_array: i32 + jit.make("WEIGHT_SCALE_DT", to_ocl_type(data_types::f16)); // scale + jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u4)); // zp + jit.make("WEIGHT_COMPRESSED_INT4", 1); + jit.make("IS_GENERATE", 0); // prefill + if (cfg.weight_group_size > 0) + jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[2]); + else + jit.make("NUM_GROUPS", 1); + + const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); + // u4:bfyx:4x3072x8x128:nopad + size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); + jit.make("EXPERT_STRIDE", expert_stride / 2); + + const auto& input_shape = params.input_layouts[0].get_shape(); + jit.make("INPUT_SEQ_LEN", input_shape[0]); + // f16:bfyx:[?,2048]:nopad + jit.make("INPUT_STRIDE", input_shape.size() == 3 ? input_shape[1] * input_shape[2] : input_shape[1]); + + const auto& output_shape = params.output_layouts[0].get_shape(); + jit.make("OUTPUT_STRIDE", output_shape.size() == 3 ? 
output_shape[1] * output_shape[2] : output_shape[1]); + + jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); + + auto slm_size = moe_gemm.getSetting("slm_size"); + if (slm_size > 0) + jit.make("USE_SLM", 1); + return jit; +} + +static micro::Type convert_type(ov::element::Type t) { + switch (t) { + case ov::element::f32: + return micro::Type::f32; + case ov::element::f16: + return micro::Type::f16; + case ov::element::i8: + return micro::Type::s8; + case ov::element::u8: + return micro::Type::u8; + case ov::element::i32: + return micro::Type::s32; + case ov::element::u4: + return micro::Type::u4; + case ov::element::i4: + return micro::Type::s4; + default: + break; + } + OPENVINO_THROW("Unsupported element type: ", t); +} + +std::mutex MoE3GemmMicroGenerator::mtx; +void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, micro::Package& gemm_moe, MoE3GemmMicroKernelType type) noexcept { + // TODO: Remove once micro API is thread safe + std::lock_guard l(mtx); + // auto moe_cfg = get_moe_cfg(params); + const auto& device_info = params.get_device_info(); + micro::HWInformation hw_info; + hw_info.euCount = device_info.execution_units_count; + hw_info.gmdid = device_info.ip_version; + hw_info.systolicAvailable = device_info.supports_immad; + + int wei_idx, scale_idx, zp_idx; + switch (type) { + case MoE3GemmMicroKernelType::MLP_GATE: + wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_0); + scale_idx = static_cast(MOE3GemmInputIndex::SCALE_0); + zp_idx = static_cast(MOE3GemmInputIndex::ZP_0); + break; + case MoE3GemmMicroKernelType::MLP_UP: + wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_1); + scale_idx = static_cast(MOE3GemmInputIndex::SCALE_1); + zp_idx = static_cast(MOE3GemmInputIndex::ZP_1); + break; + case MoE3GemmMicroKernelType::MLP_DOWN: + wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_2); + scale_idx = static_cast(MOE3GemmInputIndex::SCALE_2); + zp_idx = static_cast(MOE3GemmInputIndex::ZP_2); + break; + default: + OPENVINO_THROW("Unsupported 
MoE3GemmMicroKernelType"); + break; + } + + const auto& weight_shape = params.get_input_layout(wei_idx).get_shape(); + const bool is_prefill = true; + size_t m = weight_shape[1]; + size_t n = is_prefill ? 32 : 8; + size_t k = weight_shape.size() == 4 ? weight_shape[2] * weight_shape[3] : weight_shape[2]; + GPU_DEBUG_TRACE_DETAIL << "init_microkernels for " << (is_prefill ? "prefill" : "generate") << " : Seq_len:" << n << " Ofm:" << m << " K:" << k << "\n"; + + size_t group_size = weight_shape.size() == 4 ? weight_shape[3] : weight_shape[2]; + GPU_DEBUG_TRACE_DETAIL << "weight group size: " << group_size << "\n"; + + micro::GEMMProblem problem_moe; + micro::GEMMProtocol::Options opts_moe; + opts_moe.slmPtr = true; + enum class MICRO_DIMENSIONALITY { NONE = -1, SCALAR = 0, VECTOR = 1, MATRIX = 2 }; + + const bool is_weight_quantized = true; + if (is_weight_quantized) { + problem_moe.Ta = micro::Type::f16; + problem_moe.Ta_ext = convert_type(params.get_input_layout(wei_idx).data_type); + problem_moe.A.setAlignment(micro::alignment_for_ld(k * problem_moe.Ta_ext)); + + problem_moe.Ta_scale = convert_type(params.get_input_layout(scale_idx).data_type); // zp dt + problem_moe.A_scale.setAlignment(2); + problem_moe.A_scale.layout = micro::MatrixLayout::T; + problem_moe.asPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); + + problem_moe.aqGroupM = 1; + problem_moe.aqGroupK = group_size; + + opts_moe.scaleA = true; + const bool is_weight_symmetric_quantized = false; + if (!is_weight_symmetric_quantized) { + const auto& zp_layout = params.get_input_layout(zp_idx); + const auto zp_dt = convert_type(zp_layout.data_type); + problem_moe.Tao = zp_dt; + problem_moe.AO.setAlignment(zp_dt == gemmstone::Type::u4 ? 1 : static_cast(zp_dt.size())); + problem_moe.AO.layout = micro::MatrixLayout::T; + problem_moe.aoPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); + // Calculate A/B row/column sums in kernel. 
+ problem_moe.aOffset = micro::ABOffset::Calc; + opts_moe.offsetA = true; + } + } + + problem_moe.Tb = problem_moe.Tb_ext = micro::Type::f16; + problem_moe.Tc = micro::Type::f32; + problem_moe.Tc_ext = micro::Type::f32; + problem_moe.Ts = problem_moe.Tc; + problem_moe.A.layout = micro::MatrixLayout::T; + problem_moe.B.layout = micro::MatrixLayout::N; + problem_moe.C.layout = micro::MatrixLayout::N; + problem_moe.B.setAlignment(micro::alignment_for_ld(k * problem_moe.Tb)); + problem_moe.C.setAlignment(static_cast(problem_moe.Tc.size())); + + /* Set up problem_moe size information */ + micro::SizeParams sizes; + sizes.n = static_cast(n); + sizes.m = static_cast(m); + sizes.k = static_cast(k); + sizes.batch = static_cast(1); + + GPU_DEBUG_TRACE_DETAIL << "problem_moe:" << problem_moe.toString() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; + try { + /* Ask microkernel provider for microkernel */ + gemm_moe = micro::select_gemm_microkernel(opts_moe, hw_info, sizes, problem_moe); + } catch (const std::runtime_error& ex) { + OPENVINO_THROW("Can't create moe micro kernel: ", ex.what()); + } +} +DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { + return DispatchDataFunc{[this](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + assert(!params.is_dynamic()); + + auto* rtp = static_cast(rt_params); + const auto& device_info = params.get_device_info(); + const auto& gemm_p = kd.micro_kernels[0]->p; + auto sg_per_wg_n = static_cast(gemm_p.getSetting("sg_per_wg_n")); + auto sg_per_wg_m = static_cast(gemm_p.getSetting("sg_per_wg_m")); + auto sg_tile_m = gemm_p.getSetting("sg_tile_m"); + auto sg_tile_n = gemm_p.getSetting("sg_tile_n"); + + auto& wgs = kd.params.workGroups; + auto& scalars = kd.params.scalars; + scalars.clear(); + scalars.reserve(3); + + auto input_layout = params.get_input_layout(0); + auto experts_weight_layout = 
params.get_input_layout(m_wei_idx); + + // has_batch_dim indicates whether the input tensor has batch dimension + size_t n = input_layout.get_shape().size() == 3 ? input_layout.get_shape()[1] : input_layout.get_shape()[0]; + const auto& experts_weight_shape = experts_weight_layout.get_shape(); + size_t m = experts_weight_shape[1]; + size_t k = experts_weight_shape.size() == 4 ? experts_weight_shape[2] * experts_weight_shape[3] : experts_weight_shape[2]; + wgs.local = {sg_per_wg_m * get_subgroup_size(device_info.arch), sg_per_wg_n, 1}; + wgs.global = {align_to(ceil_div(m, sg_tile_m), sg_per_wg_m) * get_subgroup_size(device_info.arch), + align_to(ceil_div(n, sg_tile_n), sg_per_wg_n), + static_cast(rtp->num_actually_used_experts)}; + ScalarDescriptor s_m{ScalarDescriptor::Types::INT32}; + s_m.v.s32 = static_cast(m); + scalars.push_back(s_m); + ScalarDescriptor s_k{ScalarDescriptor::Types::INT32}; + s_k.v.s32 = static_cast(k); + scalars.push_back(s_k); + }}; +} + +std::string MoE3GemmMicroGenerator::get_build_options(const kernel_impl_params& params) const { + auto base_options = KernelGenerator::get_build_options(params); + std::string extra_options = " -Dcl_intel_dot_accumulate"; + extra_options += " -Dcl_intel_global_float_atomic"; + extra_options += " -Dcl_intel_subgroup_matrix_multiply_accumulate"; + extra_options += " -Dcl_intel_subgroup_split_matrix_multiply_accumulate"; + return base_options + extra_options; +} + +Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& params) const { + Arguments args; + if (params.is_dynamic()) + args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); + // auto cfg = get_moe_cfg(params); + + switch (m_type) { + case MoE3GemmMicroKernelType::MLP_GATE: + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_0)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); 
// experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp + break; + case MoE3GemmMicroKernelType::MLP_UP: + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_1)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp + break; + case MoE3GemmMicroKernelType::MLP_DOWN: + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // intermediate_mem[6] + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_2)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + 
args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp + break; + default: + OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); + break; + } + + return args; +} + +KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& params) const { + micro::Package moe_gemm; + const auto& device_info = params.get_device_info(); + try { + init_microkernels(params, moe_gemm, m_type); + } catch (const std::runtime_error& ex) { + OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't init microkernels: ", ex.what()); + } + + auto jit = get_jit_constants(params, moe_gemm, get_moe_3gemm_cfg(params)); + + KernelData kd; + kd.code = std::make_shared(); + kd.code->language = kernel_language::OCLC_V2; + kd.code->entry_point = get_entry_point(params); + kd.code->jit = ""; + kd.code->undefs = ""; + kd.code->options = get_build_options(params); + kd.code->batch_compilation = false; + kd.code->has_microkernels = true; + kd.code->str = build_code(get_kernel_name(), jit, kd.code->entry_point); + + kd.params.arguments = get_arguments_desc(params); + + kd.update_dispatch_data_func = get_dispatch_data_func(); + + kd.need_args_update = true; + kd.need_dispatch_data_update = true; + + /* Generate microkernel shims */ + micro::ShimOptions shim_options; + shim_options.subgroupSize = static_cast(get_subgroup_size(device_info.arch)); + shim_options.useTileOps = true; + shim_options.decorator = "moe"; + + kd.code->jit += generateShim(moe_gemm, micro::HostLanguage::OpenCL_C, shim_options); + if (moe_gemm.grfMin > 128) { + kd.code->options += " -cl-intel-256-GRF-per-thread"; + } + + kd.micro_kernels.push_back(std::make_shared(moe_gemm)); + + // Micro kernel is using slm implicitly inside the kernel. + // Therefore the slm should be allocated. 
+ uint32_t slm_size = kd.micro_kernels[0]->p.getSetting("slm_size"); + kd.params.local_memory_args.clear(); + if (slm_size > 0) { + kd.params.local_memory_args.push_back(slm_size); + kd.params.arguments.push_back({ArgumentDescriptor::Types::LOCAL_MEMORY_SIZE, slm_size}); + } + return kd; +} +} // namespace ov::intel_gpu::ocl +#endif diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp new file mode 100644 index 00000000000000..4e0acffd8f1ade --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "../utils/kernel_generator.hpp" +#include "common_utils/jitter.hpp" +#include "intel_gpu/graph/kernel_impl_params.hpp" +#include "moe_3gemm_base.hpp" +// #include "intel_gpu/primitives/moe_gemm.hpp" +#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp" +#include "micro_utils.hpp" +#include "moe_gemm_gen_opt.hpp" +#include "moe_gemm_inst.h" +#include "ocl_v2/utils/jitter.hpp" +using namespace cldnn; // TODO: Remove once namespaces are aligned +namespace ov::intel_gpu::ocl { +#ifdef ENABLE_ONEDNN_FOR_GPU +# include "micro_utils.hpp" + +class MoE3GemmMicroGenerator : public MoEGemmOptGeneratorBase { +public: + explicit MoE3GemmMicroGenerator(MoE3GemmMicroKernelType type) + : MoEGemmOptGeneratorBase("moe_3gemm_prefill_mlp", + type == MoE3GemmMicroKernelType::MLP_GATE ? "_gate" + : type == MoE3GemmMicroKernelType::MLP_UP ? 
"_up" + : "_down"), + m_type(type) { + switch (m_type) { + case MoE3GemmMicroKernelType::MLP_GATE: + m_wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_0); + m_scale_idx = static_cast(MOE3GemmInputIndex::SCALE_0); + m_zp_idx = static_cast(MOE3GemmInputIndex::ZP_0); + break; + case MoE3GemmMicroKernelType::MLP_UP: + m_wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_1); + m_scale_idx = static_cast(MOE3GemmInputIndex::SCALE_1); + m_zp_idx = static_cast(MOE3GemmInputIndex::ZP_1); + break; + case MoE3GemmMicroKernelType::MLP_DOWN: + m_wei_idx = static_cast(MOE3GemmInputIndex::WEIGHT_2); + m_scale_idx = static_cast(MOE3GemmInputIndex::SCALE_2); + m_zp_idx = static_cast(MOE3GemmInputIndex::ZP_2); + break; + default: + OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); + break; + } + } + + [[nodiscard]] std::string get_build_options(const kernel_impl_params& params) const override; + + [[nodiscard]] KernelData get_kernel_data(const kernel_impl_params& params) const override; + + [[nodiscard]] JitConstants get_jit_constants(const kernel_impl_params& params) const override { + OPENVINO_THROW("Use overloaded version instead"); + } + [[nodiscard]] JitConstants get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const; + + [[nodiscard]] Arguments get_arguments_desc(const kernel_impl_params& params) const override; + + [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override; + + static const moe_3gemm_config get_moe_3gemm_cfg(const kernel_impl_params& params) { + moe_3gemm_config cfg; + auto desc = params.typed_desc(); + cfg.weight_group_size = desc->_config.group_size; + cfg.has_batch_dim = desc->_config.has_batch_dim; + return cfg; + } + + static void init_microkernels(const kernel_impl_params& params, micro::Package& gemm_moe, MoE3GemmMicroKernelType type) noexcept; + MoE3GemmMicroKernelType m_type; + int m_wei_idx; + int m_scale_idx; + int m_zp_idx; + static std::mutex mtx; +}; +#endif +} // 
namespace ov::intel_gpu::ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 3330bf54b608af..2a161a469a1eb2 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -4,6 +4,8 @@ #include "moe_3gemm_swiglu_opt.hpp" +#include "moe_3gemm_gen_micro.hpp" + #ifdef ENABLE_ONEDNN_FOR_GPU # include # include @@ -23,6 +25,7 @@ # include "intel_gpu/runtime/stream.hpp" # include "intel_gpu/runtime/utils.hpp" # include "moe_3gemm_fused_inst.h" +# include "moe_3gemm_gen_micro.hpp" # include "ocl_v2/utils/fused_ops_jitter.hpp" # include "ocl_v2/utils/jitter.hpp" # include "primitive_inst.h" @@ -391,6 +394,159 @@ class MoE3GemmSwigluGather : public KernelGenerator { } }; +static size_t GetBlockSize(const RuntimeParams& params) { + const auto& input = params.get_input_layout(0); + size_t vec_size = 1; + switch (input.data_type) { + case ov::element::i8: + case ov::element::u8: + vec_size = 16; + break; + case ov::element::f16: + vec_size = 8; + break; + case ov::element::f32: + case ov::element::i32: + vec_size = 4; + break; + case ov::element::i64: + vec_size = 2; + break; + default: + vec_size = 1; + break; + } + return vec_size; +} + +static auto calc_thread_count(RuntimeParams& params, const size_t vector_size, const size_t hidden_size) { + auto max_wgs = params.get_program().get_engine().get_device_info().max_work_group_size; + const uint64_t threads_needed = (hidden_size + vector_size - 1) / vector_size; + size_t local_threads_needed = std::min(threads_needed, max_wgs); + size_t batches_per_thread = 1; + size_t unaligned_elements = 0; + + if (threads_needed <= max_wgs) { + batches_per_thread = 1; + unaligned_elements = hidden_size % vector_size; + } else { + batches_per_thread = (threads_needed + max_wgs - 1) / max_wgs; + auto new_block_size = 
batches_per_thread * vector_size; + unaligned_elements = hidden_size % new_block_size; + + local_threads_needed = hidden_size / new_block_size; + auto partialblock = (hidden_size % new_block_size != 0) ? 1 : 0; + local_threads_needed += partialblock; + } + + return std::tuple{local_threads_needed, batches_per_thread, unaligned_elements}; +} +class MoE3GemmSwigluPrefillGather : public KernelGenerator { +public: + MoE3GemmSwigluPrefillGather() : KernelGenerator("moe_gather_ref", "prefill_gather") {} + +protected: + [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override { + auto jit = KernelGenerator::get_jit_constants(params); + auto desc = params.typed_desc(); + // auto& engine = params.prog->get_engine(); + // const auto& info = engine.get_device_info(); + + auto hidden_size = desc->_config.hidden_size; + auto block_size = GetBlockSize(params); + auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(params), block_size, hidden_size); + + jit.make("HIDDEN_SIZE", hidden_size); + jit.make("VEC_BLK_SIZE", block_size); + jit.make("BATCHES_PER_THREAD", batches_per_thread); + jit.make("UNALIGNED_ELEMENTS", unaligned_elements); + return jit; + } + + [[nodiscard]] Arguments get_arguments_desc(const RuntimeParams& params) const override { + Arguments args; + + return args; + } + + [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { + return DispatchDataFunc{nullptr}; + } +}; + +class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { +public: + MoE3GemmSwigluPrefillSwiglu() : KernelGenerator("moe_3gemm_swiglu_fuse", "prefill_swiglu") {} + +protected: + [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override { + auto jit = KernelGenerator::get_jit_constants(params); + auto desc = params.typed_desc(); + auto& engine = params.prog->get_engine(); + const auto& info = engine.get_device_info(); + + jit.make("PREFILL_SWIGLU_ENABLE", 1); + 
jit.make("SUBGROUP_SIZE", info.arch >= gpu_arch::xe2 ? 32 : 16); + jit.make("HIDDEN_SIZE", desc->_config.hidden_size); + jit.make("MOE_DTYPE", "half"); + return jit; + } + + [[nodiscard]] Arguments get_arguments_desc(const RuntimeParams& params) const override { + Arguments args; + + return args; + } + + [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { + return DispatchDataFunc{nullptr}; + } +}; + +class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { +public: + MoE3GemmSwigluPrefillScatterReduce() : KernelGenerator("moe_scatter_reduction_opt", "moe_scatter_reduction_ref") {} + +protected: + [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override { + auto jit = KernelGenerator::get_jit_constants(params); + auto desc = params.typed_desc(); + // auto& engine = params.prog->get_engine(); + // const auto& info = engine.get_device_info(); + + auto hidden_size = desc->_config.hidden_size; + auto block_size = 4; + auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(params), block_size, hidden_size); + + jit.make("ACTIVE_EXPERTS", desc->_config.top_k); + jit.make("HIDDEN_SIZE", hidden_size); + jit.make("VEC_BLK_SIZE", 4); + jit.make("BATCHES_PER_THREAD", batches_per_thread); + jit.make("SET_ACTUAL_USED_EXPERTS_NUM", 1); + + jit.make("INPUT0_TYPE", "half"); + jit.make("INPUT1_TYPE", "int"); + jit.make("INPUT2_TYPE", "int"); + jit.make("INPUT3_TYPE", "int"); + jit.make("INPUT4_TYPE", "int"); + jit.make("INPUT5_TYPE", "int"); + jit.make("INPUT6_TYPE", "int"); + jit.make("OUTPUT_TYPE", "half"); + + return jit; + } + + [[nodiscard]] Arguments get_arguments_desc(const RuntimeParams& params) const override { + Arguments args; + + return args; + } + + [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { + return DispatchDataFunc{nullptr}; + } +}; + class MoE3GemmSwigluScatter : public KernelGenerator { public: MoE3GemmSwigluScatter() : 
KernelGenerator("moe_3gemm_swiglu_fuse", "index_add") {} @@ -530,6 +686,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { Stage::Ptr mlp_down = make_stage(); Stage::Ptr mlp_reduce = make_stage(); + Stage::Ptr prefill_gather = make_stage(); + Stage::Ptr micro_gemm_gate = make_stage(MoE3GemmMicroKernelType::MLP_GATE); + Stage::Ptr micro_gemm_up = make_stage(MoE3GemmMicroKernelType::MLP_UP); + Stage::Ptr micro_gemm_down = make_stage(MoE3GemmMicroKernelType::MLP_DOWN); + Stage::Ptr prefill_swiglu = make_stage(); + Stage::Ptr prefill_scatter_reduce = make_stage(); + struct dnnl_weights { dnnl::memory weight; dnnl::memory scale; @@ -589,6 +752,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { int _gate_up_group_size; int _down_group_size; + bool use_micro_gemm_prefill = true; + moe_3gemm_swiglu_opt_impl() : PrimitiveImplOCL(moe_3gemm_swiglu_opt::get_type_info_static()) {} moe_3gemm_swiglu_opt_impl(const program_node& node, const RuntimeParams& params) : moe_3gemm_swiglu_opt_impl() { init(node.as().get_primitive()); @@ -599,6 +764,21 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(mlp_gate_up, params); add_stage(mlp_down, params); add_stage(mlp_reduce, params); + + auto use_micro_gemm_prefill_str = std::getenv("MOE_USE_MICRO_GEMM_PREFILL"); + if (use_micro_gemm_prefill_str) + use_micro_gemm_prefill = std::stoi(use_micro_gemm_prefill_str); + else + use_micro_gemm_prefill = true; + + if (use_micro_gemm_prefill) { + add_stage(prefill_gather, params); + add_stage(micro_gemm_gate, params); + add_stage(micro_gemm_up, params); + add_stage(micro_gemm_down, params); + add_stage(prefill_swiglu, params); + add_stage(prefill_scatter_reduce, params); + } } void init(const std::shared_ptr& cur_moe) { @@ -692,7 +872,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(layout_topk_id, true); // 0: topk_id internal_buffers.emplace_back(layout_topk_weights, true); // 1: topk_weights // fast 
single batch: scratch.up = up(x) * silu(gate(x)); scratch.y = down(scratch.up) * weight[expert_no] - auto max_batch = (batch == 1 ? max_topk : batch); + // To support micro_gemm, prefill need to allocate max_topk * batch for input data of micro_gemm + auto max_batch = max_topk * batch; layout layout_gateup_out(ov::PartialShape{max_batch, static_cast(config.inter_size)}, data_type, cldnn::format::bfyx); layout layout_down_out(ov::PartialShape{max_batch, static_cast(config.hidden_size)}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(layout_gateup_out, true); // 2: up @@ -710,6 +891,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(index_layout, true); // 7: batch internal_buffers.emplace_back(index_layout, true); // 8: topk + // for micro_gemm + layout layout_micro_gemm(ov::PartialShape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); + internal_buffers.emplace_back(layout_micro_gemm, true); // 9: experts_ids for each activated expert + internal_buffers.emplace_back(layout_micro_gemm, true); // 10: token start offset idx (input gather tokens) for each activated expert + internal_buffers.emplace_back(layout_micro_gemm, true); // 11: token len (input gather tokens) for each activated expert + layout layout_token_idx(ov::PartialShape{batch * max_topk}, ov::element::i32, cldnn::format::bfyx); + internal_buffers.emplace_back(layout_token_idx, true); // 12: token idx per expert return internal_buffers; } @@ -735,19 +923,19 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } // gate - scratch.moe_fusion_wei_addr.weight[0] = instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_0)); - scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast(MOEInputIndex::SCALE_0)); - scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast(MOEInputIndex::ZP_0)); + scratch.moe_fusion_wei_addr.weight[0] = 
instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_0)); + scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)); + scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)); // up - scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_1)); - scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast(MOEInputIndex::SCALE_1)); - scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast(MOEInputIndex::ZP_1)); + scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_1)); + scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)); + scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)); // down - scratch.moe_fusion_wei_addr.weight[2] = instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_2)); - scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast(MOEInputIndex::SCALE_2)); - scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast(MOEInputIndex::ZP_2)); + scratch.moe_fusion_wei_addr.weight[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_2)); + scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)); + scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)); } void get_expert_mask_from_gpu(const MOE3GemmFusedCompressed::Config& config, memory::ptr mem, stream& stream, expert_mask_cpu& expert_mask) { @@ -825,7 +1013,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::vector outputs, const std::vector& global, const std::vector& local, - bool needs_completion_event = false) const { + bool needs_completion_event = false, + std::vector scalar_inputs = {}) const { 
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("moe_3gemm_swiglu_opt_impl::execute_stage")); cldnn::stream& stream = instance.get_network().get_stream(); cldnn::kernel_arguments_data args; @@ -835,6 +1024,17 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { args.inputs.push_back(inputs[i]); } + cldnn::scalars_desc scalar_desc; + if (!scalar_inputs.empty()) { + scalar_desc.resize(scalar_inputs.size()); + for (uint32_t i = 0; i < scalar_inputs.size(); i++) { + desc.arguments.push_back({ArgumentDescriptor::Types::SCALAR, i}); + scalar_desc[i].t = ScalarDescriptor::Types::INT32; + scalar_desc[i].v.s32 = scalar_inputs[i]; + } + args.scalars = &scalar_desc; + } + for (uint32_t i = 0; i < outputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, i}); args.outputs.push_back(outputs[i]); @@ -862,7 +1062,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); auto batch_mem_ptr = scratch.topk_id; - auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOEInputIndex::HIDDEN_STATES)); + auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); auto routing_mem_ptr = scratch.topk_weights; _hidden_size = static_cast(cur_moe->_config.hidden_size); @@ -920,6 +1120,211 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return ret; } + cldnn::event::ptr exec_prefill_opt(const std::vector& events, + typed_primitive_inst& instance, + scratch_buffers& scratch, + expert_mask_cpu& expert_mask_cpu) { + auto cur_moe = instance.get_typed_desc(); + int max_topk = static_cast(cur_moe->_config.top_k); + + auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); + auto batch_mem_ptr = scratch.topk_id; + auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, 
static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); + auto routing_mem_ptr = scratch.topk_weights; + + _hidden_size = static_cast(cur_moe->_config.hidden_size); + _intermediate_size = static_cast(cur_moe->_config.inter_size); + + const size_t subgroup_size = instance.get_impl_params()->get_device_info().arch >= gpu_arch::xe2 ? 32 : 16; + // const size_t max_work_group_size = instance.get_impl_params()->get_device_info().max_work_group_size; + + event::ptr ret_event; + const auto& intermediates_memories = instance.get_intermediates_memories(); + auto& stream = instance.get_network().get_stream(); + auto num_total_experts = static_cast(cur_moe->_config.num_expert); + int num_actually_used_experts = 0; + + // step 1: generate 4 mask data for following kernel execution + // input: topk output, [token_len, expert_topk] + // output: + // mask 0: token idx per expert, static shape = [token_num * topK_num] = [expert_num, ?] + // mask 1: token start offset idx (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // mask 2: token len (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // mask 3: expert id, dynamic shape = [activated_expert_num] + { + cldnn::mem_lock tokens_per_expert_lock(intermediates_memories[12], stream); + cldnn::mem_lock experts_info_start_idx_lock(intermediates_memories[10], stream); + cldnn::mem_lock experts_id_lock(intermediates_memories[9], stream); + cldnn::mem_lock tokens_lens_per_expert_lock(intermediates_memories[11], stream); + + int tokens_per_expert_iter = 0; + int experts_id_iter = 0; + + for (int expert_idx = 0; expert_idx < num_total_experts; expert_idx++) { + if (!expert_mask_cpu.batch[expert_idx].empty()) { + experts_info_start_idx_lock[experts_id_iter] = tokens_per_expert_iter; + experts_id_lock[experts_id_iter] = expert_idx; + tokens_lens_per_expert_lock[experts_id_iter++] = static_cast(expert_mask_cpu.batch[expert_idx].size()); + num_actually_used_experts++; + for 
(auto t : expert_mask_cpu.batch[expert_idx]) { + tokens_per_expert_lock[tokens_per_expert_iter++] = t; + } + } + } + + // debug print + { + std::cout << "step 1: prefill_mask num_actually_used_experts=" << num_actually_used_experts << std::endl; + std::cout << "expert_id[" << num_actually_used_experts << "]: = " << std::endl; + for (int i = 0; i < num_actually_used_experts; i++) { + std::cout << experts_id_lock[i] << ", " << std::endl; + } + std::cout << std::endl; + std::cout << "experts_info_start_idx[" << num_actually_used_experts << "]: = " << std::endl; + for (int i = 0; i < num_actually_used_experts; i++) { + std::cout << experts_info_start_idx_lock[i] << ", " << std::endl; + } + std::cout << std::endl; + std::cout << "tokens_len_per_expert[" << num_actually_used_experts << "]: = " << std::endl; + for (int i = 0; i < num_actually_used_experts; i++) { + std::cout << tokens_lens_per_expert_lock[i] << ", " << std::endl; + } + std::cout << std::endl; + std::cout << "tokens_per_expert[" << num_actually_used_experts << "]:" << std::endl; + int token_idx = 0; + for (int i = 0; i < num_actually_used_experts; i++) { + std::cout << "\texpert[" << i << "]: = " << std::endl; + for (int j = 0; j < tokens_lens_per_expert_lock[i]; j++) { + std::cout << tokens_per_expert_lock[token_idx + j] << ", " << std::endl; + } + token_idx += tokens_lens_per_expert_lock[i]; + std::cout << std::endl; + } + std::cout << std::endl; + } + } + + // step 2: generate gather input tokens + // input + // 0: input tensor, shape = [token_len, hidden_size] + // 1: token idx per expert, static shape = [expert_num * topK_num] + // output + // 0: gathered token: shape = [token_len * expert_topK, hidden_size] + { + auto hidden_size = _hidden_size; + auto block_size = GetBlockSize(*instance.get_impl_params()); + auto [local_threads_count, batches_per_thread, unaligned_elements] = + calc_thread_count(const_cast(*instance.get_impl_params()), block_size, hidden_size); + auto token_per_expert = 1; + + 
std::cout << "step 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread + << ", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << std::endl; + ret_event = execute_stage(events, + instance, + *prefill_gather, + {instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), intermediates_memories[12]}, + {scratch.x}, + {static_cast(token_per_expert * local_threads_count), 1, 1}, + {static_cast(local_threads_count), 1, 1}); + } + + // step 3: moe_gemm for up and gate + // input + // 0: gathered token, shape = [token_len * expert_topK, hidden_size] + // 1: moe weights + // 2: expert id, dynamic shape = [activated_expert_num] + // 3: token start offset idx (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // 4: token len (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // 5: m = itermedia_size + // 6: k = hidden_size + // 7: wei_scale + // 8: wei_zp + // output: + // 0: up/gate output, shape = [token_len * expert_topK, hidden_size] + { + ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); + ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); + } + + // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmod(x)=x*(1+exp(-x)) + // input + // 0: up [token_len * expert_topK, hidden_size] + // 1: gate [token_len * expert_topK, hidden_size] + // output + // 0: gate_up [token_len * expert_topK, hidden_size] + { + auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); + auto token_size = input_shape[0] * max_topk; + + std::cout << "step 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _hidden_size << std::endl; + + ret_event = execute_stage({ret_event}, + instance, + *prefill_swiglu, + {intermediates_memories[2], 
intermediates_memories[6]}, + {intermediates_memories[6]}, + {static_cast(token_size), static_cast(_hidden_size), 1}, + {1, subgroup_size, 1}); + } + + // step 5: moe_gemm for down + // input + // 0: gate_up, shape = [token_len * expert_topK, hidden_size] + // 1: moe weights + // 2: expert id, dynamic shape = [activated_expert_num] + // 3: token start offset idx (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // 4: token len (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] + // 5: m = itermedia_size + // 6: k = hidden_size + // 7: wei_scale + // 8: wei_zp + // output: + // 0: down output, shape = [token_len * expert_topK, hidden_size] + + { + ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_down); + } + + // step 6: scatter and reduce + // input: + // 0: down output, shape = [token_len * expert_topK, hidden_size] + // 1: experts_per_token, shape = [token_len, expert_topK] + // 2: expert_weights, shape = [expert_num] + // 3: tokens_per_expert, shape = [expert_num, ?] 
= [token_len * expert_topK] + // 4: experts_start_offset, shape = [activated_expert_num] + // 5: tokens_len_per_expert,dynamic shape = [activated_expert_num] + // 6: expert id, dynamic shape = [activated_expert_num] + // output: + // 0: final hidden states, shape = [token_len, hidden_size] + + { + auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); + auto token_size = input_shape[0] * max_topk; + auto [local_threads_count, batches_per_thread, _] = calc_thread_count(const_cast(*instance.get_impl_params()), 4, _hidden_size); + + std::cout << "step 6: prefill_scatter_reduce token_size=" << token_size << ", local_threads_count=" << local_threads_count + << ", num_actually_used_experts = " << num_actually_used_experts << std::endl; + + ret_event = execute_stage({ret_event}, + instance, + *prefill_scatter_reduce, + {intermediates_memories[3], + batch_mem_ptr, + routing_mem_ptr, + intermediates_memories[12], + intermediates_memories[10], + intermediates_memories[11], + intermediates_memories[9]}, + {final_hidden_states_mem_ptr}, + {static_cast(token_size * local_threads_count), 1, 1}, + {local_threads_count, 1, 1}, + instance.needs_completion_event(), + {num_actually_used_experts}); + } + + return ret_event; + } + struct onednn_kernel { onednn_linear up; onednn_linear gate; @@ -944,13 +1349,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto& cur_net = instance.get_network(); auto& stream = cur_net.get_stream(); auto& dnn_stream = stream.get_onednn_stream(); - auto hidden_states_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOEInputIndex::HIDDEN_STATES))->get_layout().data_type); + auto hidden_states_layout_dt = + convert_data_type(instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().data_type); auto& dnnl_weights = _dnnl_weights[expert_no]; auto kernel = std::make_shared(); // gate - auto gate_weight_layout_dt = 
convert_data_type(instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_0))->get_layout().data_type); + auto gate_weight_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_0))->get_layout().data_type); kernel->gate = onednn_linear::create(dnn_stream.get_engine(), hidden_states_layout_dt, gate_weight_layout_dt, @@ -964,7 +1370,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { dnnl_weights[0].zp); // up - auto up_weight_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_1))->get_layout().data_type); + auto up_weight_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_1))->get_layout().data_type); kernel->up = onednn_linear::create(dnn_stream.get_engine(), hidden_states_layout_dt, up_weight_layout_dt, @@ -978,7 +1384,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { dnnl_weights[1].zp); // down - auto down_weight_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOEInputIndex::WEIGHT_2))->get_layout().data_type); + auto down_weight_layout_dt = convert_data_type(instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_2))->get_layout().data_type); kernel->down = onednn_linear::create(dnn_stream.get_engine(), hidden_states_layout_dt, down_weight_layout_dt, @@ -1018,7 +1424,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto& cur_net = instance.get_network(); auto& stream = cur_net.get_stream(); - auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOEInputIndex::HIDDEN_STATES)); + auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); auto batch = static_cast(hidden_states_layout.get_shape()[0]); scratch_buffers scratch; @@ -1029,7 +1435,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto topk_event = execute_stage(events, instance, 
*softmax_topk, - {instance.input_memory_ptr(static_cast(MOEInputIndex::ROUTING_WEIGHTS))}, + {instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ROUTING_WEIGHTS))}, {scratch.topk_id, scratch.topk_weights}, {static_cast(batch), lws_size}, {1, lws_size}); @@ -1057,6 +1463,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { expert_mask_cpu expert_mask; get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); + if (use_micro_gemm_prefill) { + std::cout << "Use micro_gemm prefill path" << std::endl; + return exec_prefill_opt({topk_event}, instance, scratch, expert_mask); + } + auto& dnn_stream = stream.get_onednn_stream(); cldnn::event::ptr result_event = nullptr; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.hpp index 8b3fa3d8c9c548..7a3315fd5c9edc 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.hpp @@ -9,29 +9,13 @@ #include "intel_gpu/primitives/activation.hpp" #include "intel_gpu/primitives/eltwise.hpp" +#include "moe_3gemm_base.hpp" #include "program_node.h" #include "registry/implementation_manager.hpp" using namespace cldnn; // TODO: Remove once namespaces are aligned namespace ov::intel_gpu::ocl { -// mlp_gate: 0 -// mlp_up: 1 -// mlp_down: 2 -enum class MOEInputIndex : uint8_t { - HIDDEN_STATES = 0, - ROUTING_WEIGHTS = 1, - WEIGHT_0 = 2, - SCALE_0 = 3, - ZP_0 = 4, - WEIGHT_1 = 5, - SCALE_1 = 6, - ZP_1 = 7, - WEIGHT_2 = 8, - SCALE_2 = 9, - ZP_2 = 10 -}; - struct moe_3gemm_swiglu_opt : public ImplementationManager { OV_GPU_PRIMITIVE_IMPL("ocl::moe::moe_3gemm_swiglu_opt") explicit moe_3gemm_swiglu_opt(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, std::move(vf)) {} @@ -46,7 +30,7 @@ struct moe_3gemm_swiglu_opt : public ImplementationManager { 
ov::element::f16, }; - const auto& in0_layout = node.get_input_layout(static_cast(MOEInputIndex::HIDDEN_STATES)); + const auto& in0_layout = node.get_input_layout(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); const auto& out_layout = node.get_output_layout(0); if (!one_of(in0_layout.format, supported_fmts) || !one_of(out_layout.format, supported_fmts)) { return false; @@ -60,7 +44,7 @@ struct moe_3gemm_swiglu_opt : public ImplementationManager { static constexpr std::array supported_wei_type = { ov::element::u4, }; - const auto& wei_layout = node.get_input_layout(static_cast(MOEInputIndex::WEIGHT_0)); + const auto& wei_layout = node.get_input_layout(static_cast(MOE3GemmInputIndex::WEIGHT_0)); if (!one_of(wei_layout.data_type, supported_wei_type)) { return false; } @@ -69,7 +53,7 @@ struct moe_3gemm_swiglu_opt : public ImplementationManager { static constexpr std::array supported_scale_type = { ov::element::f16, }; - const auto& scale_layout = node.get_input_layout(static_cast(MOEInputIndex::SCALE_0)); + const auto& scale_layout = node.get_input_layout(static_cast(MOE3GemmInputIndex::SCALE_0)); if (!one_of(scale_layout.data_type, supported_scale_type)) { return false; } @@ -78,7 +62,7 @@ struct moe_3gemm_swiglu_opt : public ImplementationManager { static constexpr std::array supported_zp_type = { ov::element::u4, }; - const auto& zp_layout = node.get_input_layout(static_cast(MOEInputIndex::ZP_0)); + const auto& zp_layout = node.get_input_layout(static_cast(MOE3GemmInputIndex::ZP_0)); if (!one_of(zp_layout.data_type, supported_zp_type)) { return false; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index b321525ec1b7ff..f64d05ff0264bd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -70,8 +70,8 @@ 
__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) KERNEL (gather_2d_ref)( const __global MOE_DTYPE* src_tok, // input tokens [total_token, hidden_size] - hidden_states_mem_ptr const __global MOE_DTYPE* src_rweight, // topk_weights [total_token, topk_experts] - __global int * tok_index, // token index [expert_idx][] = [actual_token_num] - expert_mask_mem.batch - __global int * top_index, // topk index [expert_idx][] = [actual_token_num] - expert_mask_mem.topk + __global int * tok_index, // token index [expert_idx][] = [actual_token_num] - expert_mask_mem.batch + __global int * top_index, // topk index [expert_idx][] = [actual_token_num] - expert_mask_mem.topk __global MOE_DTYPE* dst_tok, // output tokens [batch_size, hidden_size] - scratch.x __global MOE_DTYPE* dst_rweight) { // output topk_weights [batch_size] - scratch.routing_weights @@ -129,4 +129,33 @@ KERNEL (index_add_)(const __global MOE_DTYPE* src_tok, dst_tok[off] += src_tok[off]; #endif } + +#elif PREFILL_SWIGLU_ENABLE + +#define SWISH_BETA 1.0f +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) +KERNEL(swiglu_ref) ( + const __global MOE_DTYPE* up, // [token_len * expert_topK, hidden_size] + const __global MOE_DTYPE* gate, + __global MOE_DTYPE* output // [token_len * expert_topK, hidden_size] +) { + const uint token_idx = get_global_id(0); + const uint n_offset = get_global_id(1); + + const uint offset = token_idx * HIDDEN_SIZE + n_offset; +#if MOE_DTYPE_SIZE == 2 + half up_value = as_half(intel_sub_group_block_read_us((const __global ushort *)(up + offset))); + half gate_value = as_half(intel_sub_group_block_read_us((const __global ushort *)(gate + offset))); + half value = gate_value / (MOE_DTYPE(1.0) + native_exp(-SWISH_BETA * gate_value)); + MOE_DTYPE result = value * up_value; + intel_sub_group_block_write_us((__global ushort *)(output + offset), as_ushort(result)); +#else + MOE_DTYPE gate_value = gate[offset]; + MOE_DTYPE up_value = up[offset]; + half value = gate_value / 
(MOE_DTYPE(1.0) + native_exp(-SWISH_BETA * gate_value)); + MOE_DTYPE result = value * up_value; + output[offset] = result; +#endif +} + #endif diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl index 30ac473696c60c..8870cba94c6ac4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl @@ -66,8 +66,12 @@ KERNEL(moe_gemm)(OPTIONAL_SHAPE_INFO_ARG #ifdef WEIGHT_COMPRESSED_INT4 weight_scales += experts_ids[batch] * m * NUM_GROUPS; #ifdef WEIGHT_ZP_DT + #ifdef WEIGHT_COMPRESSED_ZP_INT4 + weight_zps += experts_ids[batch] * m * NUM_GROUPS / 2; + #else weight_zps += experts_ids[batch] * m * NUM_GROUPS; #endif + #endif #endif int ld_weight = k; int cur_n_tokens = n_array[batch]; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl index f3e96aa905864f..5412f14606f83f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl @@ -19,6 +19,9 @@ KERNEL(moe_scatter_reduction_ref)( const __global INPUT4_TYPE* experts_start_offset, const __global INPUT5_TYPE* tokens_len_per_expert, const __global INPUT6_TYPE* experts_ids, +#ifdef SET_ACTUAL_USED_EXPERTS_NUM + const uint actual_used_expert_num, +#endif __global OUTPUT_TYPE* output ) { @@ -32,7 +35,11 @@ KERNEL(moe_scatter_reduction_ref)( if (threads_index < ACTIVE_EXPERTS) { INPUT1_TYPE expert_id = experts_per_token[token_group_id * ACTIVE_EXPERTS + threads_index]; +#ifdef SET_ACTUAL_USED_EXPERTS_NUM + for (int i = 0; i < actual_used_expert_num; i++) { +#else for (int i = 0; i < INPUT6_BATCH_NUM; i++) { +#endif if (experts_ids[i] == expert_id) { start_offset_index[threads_index] = i; break; diff --git 
a/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp index 13010a5f6b28e8..fb7a62d8ef34be 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp @@ -234,6 +234,7 @@ ConvertMOEToMOECompressed::ConvertMOEToMOECompressed(bool is_pa) { } config.top_k = topk_shape[1].get_length(); config.out_type = ov::element::f16; + config.has_batch_dim = is_pa ? 0 : 1; auto moe_compressed = std::make_shared(args, config); moe_compressed->set_friendly_name(moe->get_friendly_name()); From 8790e20c002b244eb3ad17842bd509ea2ecaa272 Mon Sep 17 00:00:00 2001 From: River Date: Fri, 21 Nov 2025 11:15:00 +0800 Subject: [PATCH 02/20] Fixed out of resource issue --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 67 ++++++-- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp | 8 +- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 149 ++++++++++++++---- .../impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 4 +- .../impls/ocl_v2/moe_scatter_reduction_opt.cl | 1 - 5 files changed, 176 insertions(+), 53 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 252612601ec4e1..3745c7c3e31bd8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -36,6 +36,8 @@ static size_t get_subgroup_size(gpu_arch arch) { JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const { const auto& device_info = params.get_device_info(); + + std::cout << "MoE3GemmMicroGenerator::get_jit_constants() - " << __LINE__ << std::endl; auto jit = make_base_jit_constants(params); 
jit.make("SUBGROUP_SIZE", get_subgroup_size(device_info.arch)); jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // output @@ -48,29 +50,41 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u4)); // zp jit.make("WEIGHT_COMPRESSED_INT4", 1); jit.make("IS_GENERATE", 0); // prefill + + std::cout << "\t m_scale_idx: " << m_scale_idx << std::endl; + std::cout << "\t params.input_layouts[m_scale_idx].get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; if (cfg.weight_group_size > 0) - jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[2]); + jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); else jit.make("NUM_GROUPS", 1); + std::cout << "\t m_wei_idx: " << m_wei_idx << std::endl; + std::cout << "\t params.input_layouts[m_wei_idx].get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); // u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); - const auto& input_shape = params.input_layouts[0].get_shape(); - jit.make("INPUT_SEQ_LEN", input_shape[0]); + std::cout << "\t params.input_layouts[0].get_shape(): " << params.input_layouts[0].to_short_string() << std::endl; + const auto& input_shape = params.input_layouts[0].get_partial_shape(); + //jit.make("INPUT_SEQ_LEN", input_shape[0]); + jit.make("INPUT_SEQ_LEN", 0); // prefill not use it + // f16:bfyx:[?,2048]:nopad jit.make("INPUT_STRIDE", input_shape.size() == 3 ? 
input_shape[1] * input_shape[2] : input_shape[1]); - const auto& output_shape = params.output_layouts[0].get_shape(); + std::cout << "\t params.output_layouts[0].get_shape(): " << params.output_layouts[0].to_short_string() << std::endl; + const auto& output_shape = params.output_layouts[0].get_partial_shape(); jit.make("OUTPUT_STRIDE", output_shape.size() == 3 ? output_shape[1] * output_shape[2] : output_shape[1]); jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); + jit.make("OPTIONAL_SHAPE_INFO_ARG",""); auto slm_size = moe_gemm.getSetting("slm_size"); if (slm_size > 0) jit.make("USE_SLM", 1); + + std::cout << "MoE3GemmMicroGenerator::get_jit_constants() done " << std::endl; return jit; } @@ -134,10 +148,10 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, size_t m = weight_shape[1]; size_t n = is_prefill ? 32 : 8; size_t k = weight_shape.size() == 4 ? weight_shape[2] * weight_shape[3] : weight_shape[2]; - GPU_DEBUG_TRACE_DETAIL << "init_microkernels for " << (is_prefill ? "prefill" : "generate") << " : Seq_len:" << n << " Ofm:" << m << " K:" << k << "\n"; + std::cout << "init_microkernels for " << (is_prefill ? "prefill" : "generate") << " : Seq_len:" << n << " Ofm:" << m << " K:" << k << "\n"; size_t group_size = weight_shape.size() == 4 ? 
weight_shape[3] : weight_shape[2]; - GPU_DEBUG_TRACE_DETAIL << "weight group size: " << group_size << "\n"; + std::cout << "weight group size: " << group_size << "\n"; micro::GEMMProblem problem_moe; micro::GEMMProtocol::Options opts_moe; @@ -190,19 +204,22 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, sizes.k = static_cast(k); sizes.batch = static_cast(1); - GPU_DEBUG_TRACE_DETAIL << "problem_moe:" << problem_moe.toString() << "\n"; - GPU_DEBUG_TRACE_DETAIL << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; + std::cout << "problem_moe:" << problem_moe.toString() << "\n"; + std::cout << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; try { /* Ask microkernel provider for microkernel */ gemm_moe = micro::select_gemm_microkernel(opts_moe, hw_info, sizes, problem_moe); } catch (const std::runtime_error& ex) { OPENVINO_THROW("Can't create moe micro kernel: ", ex.what()); } + std::cout << "init_microkernels is done" << std::endl; } DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { - return DispatchDataFunc{[this](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + const auto wei_idx = this->m_wei_idx; + return DispatchDataFunc{[this, wei_idx](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { assert(!params.is_dynamic()); + std::cout << "MoE3GemmMicroGenerator::DispatchDataFunc()" << std::endl; auto* rtp = static_cast(rt_params); const auto& device_info = params.get_device_info(); const auto& gemm_p = kd.micro_kernels[0]->p; @@ -217,7 +234,11 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { scalars.reserve(3); auto input_layout = params.get_input_layout(0); - auto experts_weight_layout = params.get_input_layout(m_wei_idx); + auto experts_weight_layout = params.get_input_layout(wei_idx); + + std::cout << "\t input_layout: " << input_layout.to_short_string() << 
std::endl; + std::cout << "\t wei_idx = " << wei_idx << std::endl; + std::cout << "\t experts_weight_layout: " << experts_weight_layout.to_short_string() << std::endl; // has_batch_dim indicates whether the input tensor has batch dimension size_t n = input_layout.get_shape().size() == 3 ? input_layout.get_shape()[1] : input_layout.get_shape()[0]; @@ -248,14 +269,15 @@ std::string MoE3GemmMicroGenerator::get_build_options(const kernel_impl_params& Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& params) const { Arguments args; - if (params.is_dynamic()) - args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); + // if (params.is_dynamic()) + // args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); // auto cfg = get_moe_cfg(params); switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_0)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // gate output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array @@ -267,6 +289,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p case MoE3GemmMicroKernelType::MLP_UP: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_1)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); // up output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // 
n_array @@ -278,6 +301,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p case MoE3GemmMicroKernelType::MLP_DOWN: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // intermediate_mem[6] args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_2)}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3}); // down output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array @@ -303,8 +327,10 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't init microkernels: ", ex.what()); } + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; auto jit = get_jit_constants(params, moe_gemm, get_moe_3gemm_cfg(params)); + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; KernelData kd; kd.code = std::make_shared(); kd.code->language = kernel_language::OCLC_V2; @@ -314,11 +340,22 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par kd.code->options = get_build_options(params); kd.code->batch_compilation = false; kd.code->has_microkernels = true; - kd.code->str = build_code(get_kernel_name(), jit, kd.code->entry_point); + + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; + try { + std::cout << "\t get_kernel_name(): " << get_kernel_name() << std::endl; + std::cout << "\t kd.code->entry_point: " << kd.code->entry_point << std::endl; + kd.code->str = build_code(get_kernel_name(), jit, kd.code->entry_point); + } catch (const std::runtime_error& ex) { + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - can't build code: " << ex.what() << std::endl; + 
OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't build code: ", ex.what()); + } + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.params.arguments = get_arguments_desc(params); kd.update_dispatch_data_func = get_dispatch_data_func(); + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.need_args_update = true; kd.need_dispatch_data_update = true; @@ -328,12 +365,14 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par shim_options.subgroupSize = static_cast(get_subgroup_size(device_info.arch)); shim_options.useTileOps = true; shim_options.decorator = "moe"; + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.code->jit += generateShim(moe_gemm, micro::HostLanguage::OpenCL_C, shim_options); if (moe_gemm.grfMin > 128) { kd.code->options += " -cl-intel-256-GRF-per-thread"; } + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.micro_kernels.push_back(std::make_shared(moe_gemm)); // Micro kernel is using slm implicitly inside the kernel. 
@@ -344,6 +383,8 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par kd.params.local_memory_args.push_back(slm_size); kd.params.arguments.push_back({ArgumentDescriptor::Types::LOCAL_MEMORY_SIZE, slm_size}); } + + std::cout << "MoE3GemmMicroGenerator::get_kernel_data() completed\n"; return kd; } } // namespace ov::intel_gpu::ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp index 4e0acffd8f1ade..ada213c48fc559 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp @@ -24,10 +24,10 @@ namespace ov::intel_gpu::ocl { class MoE3GemmMicroGenerator : public MoEGemmOptGeneratorBase { public: explicit MoE3GemmMicroGenerator(MoE3GemmMicroKernelType type) - : MoEGemmOptGeneratorBase("moe_3gemm_prefill_mlp", - type == MoE3GemmMicroKernelType::MLP_GATE ? "_gate" - : type == MoE3GemmMicroKernelType::MLP_UP ? "_up" - : "_down"), + : MoEGemmOptGeneratorBase("moe_gemm", + type == MoE3GemmMicroKernelType::MLP_GATE ? "_prefill_mlp_gate" + : type == MoE3GemmMicroKernelType::MLP_UP ? 
"_prefill_mlp_up" + : "_prefill_mlp_down"), m_type(type) { switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 2a161a469a1eb2..0b2761ff64f9e5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -2,9 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "moe_3gemm_swiglu_opt.hpp" - +// clang-format off #include "moe_3gemm_gen_micro.hpp" +#include "moe_3gemm_swiglu_opt.hpp" +// clang-format on #ifdef ENABLE_ONEDNN_FOR_GPU # include @@ -361,7 +362,9 @@ class MoE3GemmSwigluSoftMaxTopK : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluSoftMaxTopK::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -390,7 +393,9 @@ class MoE3GemmSwigluGather : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluGather::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -460,6 +465,12 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { jit.make("VEC_BLK_SIZE", block_size); jit.make("BATCHES_PER_THREAD", batches_per_thread); jit.make("UNALIGNED_ELEMENTS", unaligned_elements); + + jit.make("INPUT0_TYPE", "half"); + jit.make("INPUT1_TYPE", "int"); + jit.make("OUTPUT_TYPE", "half"); + jit.make("OPTIONAL_SHAPE_INFO_ARG",""); + return jit; } @@ -470,7 +481,9 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { } [[nodiscard]] 
DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluPrefillGather::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -499,7 +512,9 @@ class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluPrefillSwiglu::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -518,6 +533,7 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { auto block_size = 4; auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(params), block_size, hidden_size); + jit.make("OPTIONAL_SHAPE_INFO_ARG",""); jit.make("ACTIVE_EXPERTS", desc->_config.top_k); jit.make("HIDDEN_SIZE", hidden_size); jit.make("VEC_BLK_SIZE", 4); @@ -543,7 +559,9 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluPrefillScatterReduce::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -568,7 +586,9 @@ class MoE3GemmSwigluScatter : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluScatter::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -621,7 +641,9 @@ class MoE3GemmSwigluMLPGateUp : public KernelGenerator { } [[nodiscard]] DispatchDataFunc 
get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluMLPGateUp::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -644,7 +666,9 @@ class MoE3GemmSwigluMLPDown : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluMLPDown::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -667,7 +691,9 @@ class MoE3GemmSwigluMLPReduce : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{nullptr}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + std::cout << "MoE3GemmSwigluMLPReduce::get_dispatch_data_func()" << std::endl; + }}; } }; @@ -676,6 +702,7 @@ dnnl::memory convert2dnnl(const memory::ptr& ptr, const std::vector& di return ptr->get_onednn_memory(dnnl::memory::desc(dnnl::memory::dims(dim), convert_data_type(ptr->get_layout().data_type), tag), offset); } +static bool use_micro_gemm_prefill = true; class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { public: DECLARE_OBJECT_TYPE_SERIALIZATION(ov::intel_gpu::ocl::MoE3GemmSwigluImpl) @@ -752,8 +779,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { int _gate_up_group_size; int _down_group_size; - bool use_micro_gemm_prefill = true; - moe_3gemm_swiglu_opt_impl() : PrimitiveImplOCL(moe_3gemm_swiglu_opt::get_type_info_static()) {} moe_3gemm_swiglu_opt_impl(const program_node& node, const RuntimeParams& params) : moe_3gemm_swiglu_opt_impl() { init(node.as().get_primitive()); @@ -766,6 +791,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(mlp_reduce, params); auto 
use_micro_gemm_prefill_str = std::getenv("MOE_USE_MICRO_GEMM_PREFILL"); + std::cout << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; + if (use_micro_gemm_prefill_str) use_micro_gemm_prefill = std::stoi(use_micro_gemm_prefill_str); else @@ -773,11 +800,17 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { if (use_micro_gemm_prefill) { add_stage(prefill_gather, params); + std::cout << "prefill_gather" << std::endl; add_stage(micro_gemm_gate, params); + std::cout << "micro_gemm_gate" << std::endl; add_stage(micro_gemm_up, params); + std::cout << "micro_gemm_up" << std::endl; add_stage(micro_gemm_down, params); + std::cout << "micro_gemm_down" << std::endl; add_stage(prefill_swiglu, params); + std::cout << "prefill_swiglu" << std::endl; add_stage(prefill_scatter_reduce, params); + std::cout << "prefill_scatter_reduce" << std::endl; } } @@ -1019,11 +1052,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::stream& stream = instance.get_network().get_stream(); cldnn::kernel_arguments_data args; cldnn::kernel_arguments_desc desc; + + std::cout << "execute_stage: " << std::endl; for (uint32_t i = 0; i < inputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::INPUT, i}); args.inputs.push_back(inputs[i]); + std::cout << "\tinput[" << i << "]: " << inputs[i]->get_layout().to_short_string() << std::endl; } - cldnn::scalars_desc scalar_desc; if (!scalar_inputs.empty()) { scalar_desc.resize(scalar_inputs.size()); @@ -1033,17 +1068,30 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { scalar_desc[i].v.s32 = scalar_inputs[i]; } args.scalars = &scalar_desc; + std::cout << "\tscalar_inputs: " << std::endl; + for (const auto& scalar : scalar_inputs) { + std::cout << "\t\t" << scalar << std::endl; + } } for (uint32_t i = 0; i < outputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, i}); args.outputs.push_back(outputs[i]); + std::cout << "\toutput[" << i << "]: " << 
outputs[i]->get_layout().to_short_string() << std::endl; } stream.set_arguments(*stage.kernel, desc, args); desc.workGroups.global = global; desc.workGroups.local = local; + if(global.size() == 2) { + std::cout << "\tgws[] = " << global[0] << ", " << global[1] << std::endl; + std::cout << "\tlws[] = " << local[0] << ", " << local[1] << std::endl; + } else if(global.size() == 3) { + std::cout << "\tgws[] = " << global[0] << ", " << global[1] << ", " << global[2] << std::endl; + std::cout << "\tlws[] = " << local[0] << ", " << local[1] << ", " << local[2] << std::endl; + } + return stream.enqueue_kernel(*stage.kernel, desc, {}, events, needs_completion_event); } @@ -1171,31 +1219,33 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } } } + auto rtp = static_cast(m_rt_params.get()); + rtp->num_actually_used_experts = num_actually_used_experts; // debug print { - std::cout << "step 1: prefill_mask num_actually_used_experts=" << num_actually_used_experts << std::endl; - std::cout << "expert_id[" << num_actually_used_experts << "]: = " << std::endl; + std::cout << "\nstep 1: prefill_mask num_actually_used_experts=" << num_actually_used_experts << std::endl; + std::cout << "expert_id[" << num_actually_used_experts << "]: = "; for (int i = 0; i < num_actually_used_experts; i++) { - std::cout << experts_id_lock[i] << ", " << std::endl; + std::cout << experts_id_lock[i] << ", "; } std::cout << std::endl; - std::cout << "experts_info_start_idx[" << num_actually_used_experts << "]: = " << std::endl; + std::cout << "experts_info_start_idx[" << num_actually_used_experts << "]: = "; for (int i = 0; i < num_actually_used_experts; i++) { - std::cout << experts_info_start_idx_lock[i] << ", " << std::endl; + std::cout << experts_info_start_idx_lock[i] << ", "; } std::cout << std::endl; - std::cout << "tokens_len_per_expert[" << num_actually_used_experts << "]: = " << std::endl; + std::cout << "tokens_len_per_expert[" << num_actually_used_experts << "]: = "; for (int i = 
0; i < num_actually_used_experts; i++) { - std::cout << tokens_lens_per_expert_lock[i] << ", " << std::endl; + std::cout << tokens_lens_per_expert_lock[i] << ", "; } std::cout << std::endl; std::cout << "tokens_per_expert[" << num_actually_used_experts << "]:" << std::endl; int token_idx = 0; for (int i = 0; i < num_actually_used_experts; i++) { - std::cout << "\texpert[" << i << "]: = " << std::endl; + std::cout << "\texpert[" << i << "]: = "; for (int j = 0; j < tokens_lens_per_expert_lock[i]; j++) { - std::cout << tokens_per_expert_lock[token_idx + j] << ", " << std::endl; + std::cout << tokens_per_expert_lock[token_idx + j] << ", "; } token_idx += tokens_lens_per_expert_lock[i]; std::cout << std::endl; @@ -1207,17 +1257,17 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // step 2: generate gather input tokens // input // 0: input tensor, shape = [token_len, hidden_size] - // 1: token idx per expert, static shape = [expert_num * topK_num] + // 1: token idx per expert, static shape = [token_num * topK_num] // output // 0: gathered token: shape = [token_len * expert_topK, hidden_size] - { + if(1){ auto hidden_size = _hidden_size; auto block_size = GetBlockSize(*instance.get_impl_params()); auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(*instance.get_impl_params()), block_size, hidden_size); - auto token_per_expert = 1; + auto token_per_expert = intermediates_memories[12]->get_layout().get_shape()[0]; - std::cout << "step 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread + std::cout << "\nstep 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread << ", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << std::endl; ret_event = execute_stage(events, instance, @@ -1226,6 +1276,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { 
{scratch.x}, {static_cast(token_per_expert * local_threads_count), 1, 1}, {static_cast(local_threads_count), 1, 1}); + + stream.finish(); //debug } // step 3: moe_gemm for up and gate @@ -1241,9 +1293,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 8: wei_zp // output: // 0: up/gate output, shape = [token_len * expert_topK, hidden_size] - { + if(1){ + std::cout << "\nstep 3: moe_gemm for up and gate" << std::endl; ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); + ret_event->wait(); //debug + stream.finish(); //debug ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); + ret_event->wait(); //debug + stream.finish(); //debug } // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmod(x)=x*(1+exp(-x)) @@ -1256,7 +1313,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; - std::cout << "step 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _hidden_size << std::endl; + std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _hidden_size << std::endl; ret_event = execute_stage({ret_event}, instance, @@ -1265,6 +1322,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {intermediates_memories[6]}, {static_cast(token_size), static_cast(_hidden_size), 1}, {1, subgroup_size, 1}); + + ret_event->wait(); //debug + stream.finish(); //debug } // step 5: moe_gemm for down @@ -1281,8 +1341,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // output: // 0: down output, shape = [token_len * expert_topK, hidden_size] - { + if(1){ + std::cout << "\nstep 5: moe_gemm for down" << std::endl; ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_down); + ret_event->wait(); //debug + stream.finish(); //debug } // step 6: 
scatter and reduce @@ -1297,12 +1360,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // output: // 0: final hidden states, shape = [token_len, hidden_size] - { + if(1){ auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; auto [local_threads_count, batches_per_thread, _] = calc_thread_count(const_cast(*instance.get_impl_params()), 4, _hidden_size); - std::cout << "step 6: prefill_scatter_reduce token_size=" << token_size << ", local_threads_count=" << local_threads_count + std::cout << "\nstep 6: prefill_scatter_reduce token_size=" << token_size << ", local_threads_count=" << local_threads_count << ", num_actually_used_experts = " << num_actually_used_experts << std::endl; ret_event = execute_stage({ret_event}, @@ -1320,11 +1383,30 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {local_threads_count, 1, 1}, instance.needs_completion_event(), {num_actually_used_experts}); + + ret_event->wait(); //debug + stream.finish(); //debug } return ret_event; } + void update_rt_params(const primitive_inst& instance) override { + if (m_rt_params == nullptr) { + m_rt_params = std::make_unique(); + } + update_stages_flags(instance); + auto rtp = static_cast(m_rt_params.get()); + // rtp->num_actually_used_experts = instance.get_input_layout(moe_gemm::MoEGemmInputIdx::EXPERTS_IDS).get_shape()[0]; + std::cout << "moe_3gemm :: num_actually_used_experts = " << rtp->num_actually_used_experts << "\n"; + } + + void update(primitive_inst& inst, const kernel_impl_params& impl_params) override { + PrimitiveImplOCL::update(inst, impl_params); + inst.update_shape_info_tensor(impl_params); + update_rt_params(inst); + } + struct onednn_kernel { onednn_linear up; onednn_linear gate; @@ -1464,7 +1546,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); if 
(use_micro_gemm_prefill) { - std::cout << "Use micro_gemm prefill path" << std::endl; + std::cout << "\nUse micro_gemm prefill path" << std::endl; + update_rt_params(instance); return exec_prefill_opt({topk_event}, instance, scratch, expert_mask); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index f64d05ff0264bd..ef5260746b75ea 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -146,13 +146,13 @@ KERNEL(swiglu_ref) ( #if MOE_DTYPE_SIZE == 2 half up_value = as_half(intel_sub_group_block_read_us((const __global ushort *)(up + offset))); half gate_value = as_half(intel_sub_group_block_read_us((const __global ushort *)(gate + offset))); - half value = gate_value / (MOE_DTYPE(1.0) + native_exp(-SWISH_BETA * gate_value)); + half value = gate_value / (1.0 + native_exp(-SWISH_BETA * gate_value)); MOE_DTYPE result = value * up_value; intel_sub_group_block_write_us((__global ushort *)(output + offset), as_ushort(result)); #else MOE_DTYPE gate_value = gate[offset]; MOE_DTYPE up_value = up[offset]; - half value = gate_value / (MOE_DTYPE(1.0) + native_exp(-SWISH_BETA * gate_value)); + half value = gate_value / (1.0 + native_exp(-SWISH_BETA * gate_value)); MOE_DTYPE result = value * up_value; output[offset] = result; #endif diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl index 5412f14606f83f..edfee2fbc30959 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_scatter_reduction_opt.cl @@ -3,7 +3,6 @@ // #include "include/batch_headers/common.cl" -#include "include/fetch_utils.cl" #define VLOAD CAT(vload, VEC_BLK_SIZE) #define VSTORE CAT(vstore, VEC_BLK_SIZE) 
From 39d99c5716c65f2f6b1b19ddbd0ae92facee2081 Mon Sep 17 00:00:00 2001 From: River Date: Fri, 21 Nov 2025 22:36:15 +0800 Subject: [PATCH 03/20] Add debug code to dump kernel's input/output --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 9 --- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 75 +++++++++++++------ 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 3745c7c3e31bd8..8b1cce010f5898 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -37,7 +37,6 @@ static size_t get_subgroup_size(gpu_arch arch) { JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const { const auto& device_info = params.get_device_info(); - std::cout << "MoE3GemmMicroGenerator::get_jit_constants() - " << __LINE__ << std::endl; auto jit = make_base_jit_constants(params); jit.make("SUBGROUP_SIZE", get_subgroup_size(device_info.arch)); jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // output @@ -327,10 +326,8 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't init microkernels: ", ex.what()); } - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; auto jit = get_jit_constants(params, moe_gemm, get_moe_3gemm_cfg(params)); - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; KernelData kd; kd.code = std::make_shared(); kd.code->language = kernel_language::OCLC_V2; @@ -341,7 +338,6 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par kd.code->batch_compilation = false; kd.code->has_microkernels = true; - std::cout 
<< "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; try { std::cout << "\t get_kernel_name(): " << get_kernel_name() << std::endl; std::cout << "\t kd.code->entry_point: " << kd.code->entry_point << std::endl; @@ -350,12 +346,10 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - can't build code: " << ex.what() << std::endl; OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't build code: ", ex.what()); } - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.params.arguments = get_arguments_desc(params); kd.update_dispatch_data_func = get_dispatch_data_func(); - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.need_args_update = true; kd.need_dispatch_data_update = true; @@ -365,14 +359,11 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par shim_options.subgroupSize = static_cast(get_subgroup_size(device_info.arch)); shim_options.useTileOps = true; shim_options.decorator = "moe"; - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.code->jit += generateShim(moe_gemm, micro::HostLanguage::OpenCL_C, shim_options); if (moe_gemm.grfMin > 128) { kd.code->options += " -cl-intel-256-GRF-per-thread"; } - - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - " << __LINE__ << std::endl; kd.micro_kernels.push_back(std::make_shared(moe_gemm)); // Micro kernel is using slm implicitly inside the kernel. 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 0b2761ff64f9e5..5e17cf6b28ff6e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -791,26 +791,19 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(mlp_reduce, params); auto use_micro_gemm_prefill_str = std::getenv("MOE_USE_MICRO_GEMM_PREFILL"); - std::cout << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; - - if (use_micro_gemm_prefill_str) + if (use_micro_gemm_prefill_str) { + std::cout << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; use_micro_gemm_prefill = std::stoi(use_micro_gemm_prefill_str); - else + } else { use_micro_gemm_prefill = true; - + } if (use_micro_gemm_prefill) { add_stage(prefill_gather, params); - std::cout << "prefill_gather" << std::endl; add_stage(micro_gemm_gate, params); - std::cout << "micro_gemm_gate" << std::endl; add_stage(micro_gemm_up, params); - std::cout << "micro_gemm_up" << std::endl; add_stage(micro_gemm_down, params); - std::cout << "micro_gemm_down" << std::endl; add_stage(prefill_swiglu, params); - std::cout << "prefill_swiglu" << std::endl; add_stage(prefill_scatter_reduce, params); - std::cout << "prefill_scatter_reduce" << std::endl; } } @@ -1053,7 +1046,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::kernel_arguments_data args; cldnn::kernel_arguments_desc desc; - std::cout << "execute_stage: " << std::endl; + std::cout << "moe::execute_stage: " << stage.kernel->get_id() << std::endl; for (uint32_t i = 0; i < inputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::INPUT, i}); args.inputs.push_back(inputs[i]); @@ -1068,10 +1061,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { scalar_desc[i].v.s32 = 
scalar_inputs[i]; } args.scalars = &scalar_desc; - std::cout << "\tscalar_inputs: " << std::endl; + std::cout << "\tscalar_inputs: "; for (const auto& scalar : scalar_inputs) { - std::cout << "\t\t" << scalar << std::endl; + std::cout << scalar << " "; } + std::cout << std::endl; } for (uint32_t i = 0; i < outputs.size(); i++) { @@ -1084,12 +1078,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { desc.workGroups.global = global; desc.workGroups.local = local; - if(global.size() == 2) { - std::cout << "\tgws[] = " << global[0] << ", " << global[1] << std::endl; - std::cout << "\tlws[] = " << local[0] << ", " << local[1] << std::endl; - } else if(global.size() == 3) { - std::cout << "\tgws[] = " << global[0] << ", " << global[1] << ", " << global[2] << std::endl; - std::cout << "\tlws[] = " << local[0] << ", " << local[1] << ", " << local[2] << std::endl; + if (global.size() == 2) { + std::cout << "\tgws = {" << global[0] << ", " << global[1] << "}" << std::endl; + std::cout << "\tlws = {" << local[0] << ", " << local[1] << "}" << std::endl; + } else if (global.size() == 3) { + std::cout << "\tgws = {" << global[0] << ", " << global[1] << ", " << global[2] << "}" << std::endl; + std::cout << "\tlws = {" << local[0] << ", " << local[1] << ", " << local[2] << "}" << std::endl; } return stream.enqueue_kernel(*stage.kernel, desc, {}, events, needs_completion_event); @@ -1254,6 +1248,39 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } } + auto print_mem_f16 = [&](cldnn::stream& stream, memory::ptr mem, const std::string& mem_name) { + auto layout = mem->get_layout().get_shape(); + size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; + size_t col = layout.size() >= 2 ? 
layout[layout.size() - 1] : layout[0]; + cldnn::mem_lock lock_data{mem, stream}; + std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; + for (size_t j = 0; j < row; j++) { + std::cout << "\t[" << j << "]: "; + for (size_t i = 0; i < col && i < 16; i++) { + ov::float16 v = ov::float16::from_bits(lock_data[j * col + i]); + std::cout << static_cast(v) << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + }; + + auto print_mem = [&](cldnn::stream& stream, memory::ptr mem, const std::string& mem_name) { + auto layout = mem->get_layout().get_shape(); + size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; + size_t col = layout.size() >= 2 ? layout[layout.size() - 1] : layout[0]; + cldnn::mem_lock lock_data{mem, stream}; + std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; + for (size_t j = 0; j < row; j++) { + std::cout << "\t[" << j << "]: "; + for (size_t i = 0; i < col; i++) { + std::cout << lock_data[j * col + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + }; + // step 2: generate gather input tokens // input // 0: input tensor, shape = [token_len, hidden_size] @@ -1278,6 +1305,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(local_threads_count), 1, 1}); stream.finish(); //debug + print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); + print_mem(stream, intermediates_memories[12], "token idx per expert"); + print_mem_f16(stream, scratch.x, "gathered token"); + std::cout << std::endl; } // step 3: moe_gemm for up and gate @@ -1396,9 +1427,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { m_rt_params = std::make_unique(); } update_stages_flags(instance); - auto rtp = static_cast(m_rt_params.get()); - // rtp->num_actually_used_experts = instance.get_input_layout(moe_gemm::MoEGemmInputIdx::EXPERTS_IDS).get_shape()[0]; - std::cout << 
"moe_3gemm :: num_actually_used_experts = " << rtp->num_actually_used_experts << "\n"; + // auto rtp = static_cast(m_rt_params.get()); } void update(primitive_inst& inst, const kernel_impl_params& impl_params) override { From c6567d57e14bc5e8af3f14b537e8f0105572f965 Mon Sep 17 00:00:00 2001 From: River Date: Tue, 25 Nov 2025 10:05:43 +0800 Subject: [PATCH 04/20] Fix NAN issue --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 95 ++++++---- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 170 +++++++++++++----- 2 files changed, 189 insertions(+), 76 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 8b1cce010f5898..349c019ef63c79 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -9,6 +9,7 @@ #include "moe_3gemm_gen_micro.hpp" #include "intel_gpu/graph/kernel_impl_params.hpp" +#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp" // #include "intel_gpu/primitives/moe_gemm.hpp" #include "ocl_v2/utils/jitter.hpp" // #include "moe_gemm_inst.h" @@ -37,7 +38,14 @@ static size_t get_subgroup_size(gpu_arch arch) { JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const { const auto& device_info = params.get_device_info(); - auto jit = make_base_jit_constants(params); + // auto jit = make_base_jit_constants(params); + JitConstants jit; + auto entry_point = get_entry_point(params); + jit.add(make_jit_constant("KERNEL(name)", "__kernel void " + entry_point)); + jit.add(make_jit_constant("KERNEL_ID", entry_point)); + jit.make("OPTIONAL_SHAPE_INFO_ARG", ""); + jit.make("OPTIONAL_SHAPE_INFO_TENSOR", ""); + jit.make("SUBGROUP_SIZE", get_subgroup_size(device_info.arch)); jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // 
output jit.make("INPUT0_TYPE", to_ocl_type(data_types::f16)); // input: f16 @@ -45,17 +53,12 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("INPUT2_TYPE", to_ocl_type(data_types::i32)); // experts_ids: i32 jit.make("INPUT3_TYPE", to_ocl_type(data_types::i32)); // input_offset_per_expert: i32 jit.make("INPUT4_TYPE", to_ocl_type(data_types::i32)); // n_array: i32 - jit.make("WEIGHT_SCALE_DT", to_ocl_type(data_types::f16)); // scale - jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u4)); // zp + jit.make("WEIGHT_SCALE_DT", to_ocl_type(data_types::f16)); // scale: f16 + jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u8)); // zp: u4 jit.make("WEIGHT_COMPRESSED_INT4", 1); - jit.make("IS_GENERATE", 0); // prefill - - std::cout << "\t m_scale_idx: " << m_scale_idx << std::endl; - std::cout << "\t params.input_layouts[m_scale_idx].get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; - if (cfg.weight_group_size > 0) - jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); - else - jit.make("NUM_GROUPS", 1); + jit.make("IS_GENERATE", 0); // prefill + jit.make("INPUT_SEQ_LEN", 4); // prefill not use it + jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); std::cout << "\t m_wei_idx: " << m_wei_idx << std::endl; std::cout << "\t params.input_layouts[m_wei_idx].get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; @@ -63,23 +66,41 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& // u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? 
(weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); + std::cout << "\t expert_stride: " << expert_stride / 2 << std::endl; - std::cout << "\t params.input_layouts[0].get_shape(): " << params.input_layouts[0].to_short_string() << std::endl; - const auto& input_shape = params.input_layouts[0].get_partial_shape(); - //jit.make("INPUT_SEQ_LEN", input_shape[0]); - jit.make("INPUT_SEQ_LEN", 0); // prefill not use it - - // f16:bfyx:[?,2048]:nopad - jit.make("INPUT_STRIDE", input_shape.size() == 3 ? input_shape[1] * input_shape[2] : input_shape[1]); - - std::cout << "\t params.output_layouts[0].get_shape(): " << params.output_layouts[0].to_short_string() << std::endl; - const auto& output_shape = params.output_layouts[0].get_partial_shape(); - jit.make("OUTPUT_STRIDE", output_shape.size() == 3 ? output_shape[1] * output_shape[2] : output_shape[1]); + std::cout << "\t m_scale_idx: " << m_scale_idx << std::endl; + std::cout << "\t params.input_layouts[m_scale_idx].get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; + if (cfg.weight_group_size > 0) { + jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); + std::cout << "\t NUM_GROUPS: " << params.input_layouts[m_scale_idx].get_shape()[1] << std::endl; + } else { + jit.make("NUM_GROUPS", 1); + std::cout << "\t NUM_GROUPS: 1" << std::endl; + } - jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); - jit.make("OPTIONAL_SHAPE_INFO_ARG",""); + auto desc = params.typed_desc(); + switch(m_type) { + case MoE3GemmMicroKernelType::MLP_GATE: + case MoE3GemmMicroKernelType::MLP_UP: + // f16:bfyx:[?,2048]:nopad + jit.make("INPUT_STRIDE", desc->_config.hidden_size); + jit.make("OUTPUT_STRIDE", desc->_config.inter_size); + std::cout << "\t INPUT_STRIDE: " << desc->_config.hidden_size << std::endl; + std::cout << "\t OUTPUT_STRIDE: " << desc->_config.inter_size << std::endl; + break; + case 
MoE3GemmMicroKernelType::MLP_DOWN: + jit.make("INPUT_STRIDE", desc->_config.inter_size); + jit.make("OUTPUT_STRIDE", desc->_config.hidden_size); + std::cout << "\t INPUT_STRIDE: " << desc->_config.inter_size << std::endl; + std::cout << "\t OUTPUT_STRIDE: " << desc->_config.hidden_size << std::endl; + break; + default: + OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); + break; + } auto slm_size = moe_gemm.getSetting("slm_size"); + std::cout << "MoE3GemmMicroGenerator::get_jit_constants() slm_size: " << slm_size << std::endl; if (slm_size > 0) jit.make("USE_SLM", 1); @@ -147,10 +168,12 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, size_t m = weight_shape[1]; size_t n = is_prefill ? 32 : 8; size_t k = weight_shape.size() == 4 ? weight_shape[2] * weight_shape[3] : weight_shape[2]; - std::cout << "init_microkernels for " << (is_prefill ? "prefill" : "generate") << " : Seq_len:" << n << " Ofm:" << m << " K:" << k << "\n"; + + std::cout << "MoE3GemmMicroGenerator::init_microkernels: " << std::endl; + std::cout << "\t m = " << m << ", n = " << n << ", k = " << k << std::endl; size_t group_size = weight_shape.size() == 4 ? 
weight_shape[3] : weight_shape[2]; - std::cout << "weight group size: " << group_size << "\n"; + std::cout << "\t weight group size: " << group_size << "\n"; micro::GEMMProblem problem_moe; micro::GEMMProtocol::Options opts_moe; @@ -165,7 +188,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, problem_moe.Ta_scale = convert_type(params.get_input_layout(scale_idx).data_type); // zp dt problem_moe.A_scale.setAlignment(2); - problem_moe.A_scale.layout = micro::MatrixLayout::T; + problem_moe.A_scale.layout = micro::MatrixLayout::N; problem_moe.asPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); problem_moe.aqGroupM = 1; @@ -178,7 +201,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, const auto zp_dt = convert_type(zp_layout.data_type); problem_moe.Tao = zp_dt; problem_moe.AO.setAlignment(zp_dt == gemmstone::Type::u4 ? 1 : static_cast(zp_dt.size())); - problem_moe.AO.layout = micro::MatrixLayout::T; + problem_moe.AO.layout = micro::MatrixLayout::N; problem_moe.aoPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); // Calculate A/B row/column sums in kernel. problem_moe.aOffset = micro::ABOffset::Calc; @@ -230,7 +253,7 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { auto& wgs = kd.params.workGroups; auto& scalars = kd.params.scalars; scalars.clear(); - scalars.reserve(3); + scalars.reserve(2); auto input_layout = params.get_input_layout(0); auto experts_weight_layout = params.get_input_layout(wei_idx); @@ -241,6 +264,11 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { // has_batch_dim indicates whether the input tensor has batch dimension size_t n = input_layout.get_shape().size() == 3 ? 
input_layout.get_shape()[1] : input_layout.get_shape()[0]; + auto cur_moe = params.typed_desc(); + const auto& config = cur_moe->_config; + n = n * config.top_k; + std::cout << "\t n = " << n << std::endl; + const auto& experts_weight_shape = experts_weight_layout.get_shape(); size_t m = experts_weight_shape[1]; size_t k = experts_weight_shape.size() == 4 ? experts_weight_shape[2] * experts_weight_shape[3] : experts_weight_shape[2]; @@ -254,6 +282,8 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { ScalarDescriptor s_k{ScalarDescriptor::Types::INT32}; s_k.v.s32 = static_cast(k); scalars.push_back(s_k); + + std::cout << "\t m = " << m << ", k = " << k << std::endl; }}; } @@ -279,7 +309,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // gate output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale @@ -291,7 +321,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); // up output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len 
args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale @@ -303,7 +333,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3}); // down output args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale @@ -348,7 +378,6 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par } kd.params.arguments = get_arguments_desc(params); - kd.update_dispatch_data_func = get_dispatch_data_func(); kd.need_args_update = true; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 5e17cf6b28ff6e..7477ca0a1f79b6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -471,6 +471,10 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { jit.make("OUTPUT_TYPE", "half"); jit.make("OPTIONAL_SHAPE_INFO_ARG",""); + // std::cout << "MoE3GemmSwigluPrefillGather::get_jit_constants(): hidden_size: " << hidden_size << ", block_size: " << block_size + // << ", local_threads_count: " << local_threads_count << ", batches_per_thread: " << batches_per_thread + // << ", 
unaligned_elements: " << unaligned_elements << std::endl; + return jit; } @@ -542,7 +546,7 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { jit.make("INPUT0_TYPE", "half"); jit.make("INPUT1_TYPE", "int"); - jit.make("INPUT2_TYPE", "int"); + jit.make("INPUT2_TYPE", "half"); jit.make("INPUT3_TYPE", "int"); jit.make("INPUT4_TYPE", "int"); jit.make("INPUT5_TYPE", "int"); @@ -715,7 +719,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { Stage::Ptr prefill_gather = make_stage(); Stage::Ptr micro_gemm_gate = make_stage(MoE3GemmMicroKernelType::MLP_GATE); - Stage::Ptr micro_gemm_up = make_stage(MoE3GemmMicroKernelType::MLP_UP); + Stage::Ptr micro_gemm_up = make_stage(MoE3GemmMicroKernelType::MLP_UP); Stage::Ptr micro_gemm_down = make_stage(MoE3GemmMicroKernelType::MLP_DOWN); Stage::Ptr prefill_swiglu = make_stage(); Stage::Ptr prefill_scatter_reduce = make_stage(); @@ -902,16 +906,16 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto max_batch = max_topk * batch; layout layout_gateup_out(ov::PartialShape{max_batch, static_cast(config.inter_size)}, data_type, cldnn::format::bfyx); layout layout_down_out(ov::PartialShape{max_batch, static_cast(config.hidden_size)}, data_type, cldnn::format::bfyx); - internal_buffers.emplace_back(layout_gateup_out, true); // 2: up - internal_buffers.emplace_back(layout_down_out, true); // 3: y + internal_buffers.emplace_back(layout_gateup_out, true); // 2: up output + internal_buffers.emplace_back(layout_down_out, true); // 3: down output // onednn: scratch.x, scratch.routing_weights = gather(x, ...) 
// scratch.up = up(scratch.x) // scratch.gate = gate(scratch.x) * scratch.up // scratch.y = down(scratch.gate) * routing_weights - internal_buffers.emplace_back(layout_down_out, true); // 4: x, scratch.x has same layout with down output + internal_buffers.emplace_back(layout_down_out, true); // 4: up/gate input, scratch.x has same layout with down output layout routing_layout(ov::PartialShape{batch * max_topk}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(routing_layout, true); // 5: routing_weights - internal_buffers.emplace_back(layout_gateup_out, true); // 6: gate, scratch.gate has same layout with up + internal_buffers.emplace_back(layout_gateup_out, true); // 6: gate output, scratch.gate has same layout with up // expert masks for gpu layout index_layout(ov::PartialShape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); internal_buffers.emplace_back(index_layout, true); // 7: batch @@ -1162,6 +1166,44 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return ret; } + void print_mem_f16(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 1024) { + auto layout = mem->get_layout().get_shape(); + size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; + size_t col = layout.size() >= 2 ? layout[layout.size() - 1] : layout[0]; + cldnn::mem_lock lock_data{mem, stream}; + std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; + for (size_t j = 0; j < row && j < max_row; j++) { + std::cout << "\t[" << j << "]: "; + for (size_t i = 0; i < col && i < 16; i++) { + ov::float16 v = ov::float16::from_bits(lock_data[j * col + i]); + std::cout << static_cast(v) << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + }; + + void print_mem(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, int max_print = 1024) { + auto layout = mem->get_layout().get_shape(); + size_t row = layout.size() >= 2 ? 
layout[layout.size() - 2] : 1; + size_t col = layout.size() >= 2 ? layout[layout.size() - 1] : layout[0]; + cldnn::mem_lock lock_data{mem, stream}; + std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; + int print_cnt = 0; + for (size_t j = 0; j < row; j++) { + std::cout << "\t[" << j << "]: "; + for (size_t i = 0; i < col; i++) { + if (print_cnt++ >= max_print) { + std::cout << "..." << std::endl; + return; + } + std::cout << lock_data[j * col + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + }; + cldnn::event::ptr exec_prefill_opt(const std::vector& events, typed_primitive_inst& instance, scratch_buffers& scratch, @@ -1248,39 +1290,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } } - auto print_mem_f16 = [&](cldnn::stream& stream, memory::ptr mem, const std::string& mem_name) { - auto layout = mem->get_layout().get_shape(); - size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; - size_t col = layout.size() >= 2 ? layout[layout.size() - 1] : layout[0]; - cldnn::mem_lock lock_data{mem, stream}; - std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; - for (size_t j = 0; j < row; j++) { - std::cout << "\t[" << j << "]: "; - for (size_t i = 0; i < col && i < 16; i++) { - ov::float16 v = ov::float16::from_bits(lock_data[j * col + i]); - std::cout << static_cast(v) << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - }; - - auto print_mem = [&](cldnn::stream& stream, memory::ptr mem, const std::string& mem_name) { - auto layout = mem->get_layout().get_shape(); - size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; - size_t col = layout.size() >= 2 ? 
layout[layout.size() - 1] : layout[0]; - cldnn::mem_lock lock_data{mem, stream}; - std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; - for (size_t j = 0; j < row; j++) { - std::cout << "\t[" << j << "]: "; - for (size_t i = 0; i < col; i++) { - std::cout << lock_data[j * col + i] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - }; - // step 2: generate gather input tokens // input // 0: input tensor, shape = [token_len, hidden_size] @@ -1295,7 +1304,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto token_per_expert = intermediates_memories[12]->get_layout().get_shape()[0]; std::cout << "\nstep 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread - << ", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << std::endl; + << ", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << ", block_size = " << block_size + << std::endl; ret_event = execute_stage(events, instance, *prefill_gather, @@ -1305,9 +1315,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(local_threads_count), 1, 1}); stream.finish(); //debug - print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); - print_mem(stream, intermediates_memories[12], "token idx per expert"); - print_mem_f16(stream, scratch.x, "gathered token"); + // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); + // print_mem(stream, intermediates_memories[12], "token idx per expert"); + // print_mem_f16(stream, scratch.x, "gathered token"); std::cout << std::endl; } @@ -1329,9 +1339,26 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); ret_event->wait(); //debug stream.finish(); 
//debug + + { + print_mem_f16(stream, intermediates_memories[4], "up_token_input"); + print_mem(stream, intermediates_memories[9], "up_expert_id", num_actually_used_experts); + print_mem(stream, intermediates_memories[10], "up_input_offset_per_expert", num_actually_used_experts); + print_mem(stream, intermediates_memories[11], "up_token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[2], "up_output"); + } + ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); ret_event->wait(); //debug stream.finish(); //debug + + { + // print_mem_f16(stream, intermediates_memories[4], "gate_token_input"); + // print_mem(stream, intermediates_memories[9], "gate_expert_id", num_actually_used_experts); + // print_mem(stream, intermediates_memories[10], "gate_input_offset_per_expert", num_actually_used_experts); + // print_mem(stream, intermediates_memories[11], "gate_token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[6], "gate_output"); + } } // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmoid(x)=x/(1+exp(-x)) @@ -1356,6 +1383,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event->wait(); //debug stream.finish(); //debug + + { + print_mem_f16(stream, intermediates_memories[2], "silu_up_input"); + print_mem_f16(stream, intermediates_memories[6], "silu_gate_up_output"); + } } // step 5: moe_gemm for down @@ -1377,6 +1409,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_down); ret_event->wait(); //debug stream.finish(); //debug + + { + print_mem_f16(stream, intermediates_memories[6], "down_token_input"); + // print_mem(stream, intermediates_memories[9], "down_expert_id", num_actually_used_experts); + // print_mem(stream, intermediates_memories[10], "down_input_offset_per_expert", num_actually_used_experts); + // print_mem(stream, intermediates_memories[11], 
"down_token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[3], "down_output"); + } } // step 6: scatter and reduce @@ -1417,6 +1457,16 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event->wait(); //debug stream.finish(); //debug + { + print_mem_f16(stream, intermediates_memories[3], "scatter_reduce_input"); + print_mem(stream, batch_mem_ptr, "scatter_reduce_experts_per_token"); + print_mem_f16(stream, routing_mem_ptr, "scatter_reduce_expert_weights"); + print_mem(stream, intermediates_memories[12], "scatter_reduce_tokens_per_expert"); + print_mem(stream, intermediates_memories[10], "scatter_reduce_experts_start_offset", num_actually_used_experts); + print_mem(stream, intermediates_memories[11], "scatter_reduce_tokens_len_per_expert", num_actually_used_experts); + print_mem(stream, intermediates_memories[9], "scatter_reduce_expert_id", num_actually_used_experts); + print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); + } } return ret_event; @@ -1625,6 +1675,16 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {1, lws_size}, instance.needs_completion_event()); + { + // debug print + std::cout << "expert_no=" << expert_no << ", n_token=" << n_token << ", hidden_size=" << _hidden_size + << ", intermediate_size=" << _intermediate_size << std::endl; + stream.finish(); //debug + print_mem_f16(stream, hidden_states_mem_ptr, "input_token"); + print_mem_f16(stream, scratch.x, "gathered_token", n_token); + print_mem_f16(stream, scratch.routing_weights, "routing_weights"); + } + // up kernel.up.forward(dnn_stream, n_token, @@ -1632,6 +1692,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), dnnl::memory()); + { + // debug print + stream.finish(); //debug + print_mem_f16(stream, scratch.up, "up_output", n_token); + } + // gate kernel.gate.forward(dnn_stream, n_token, @@ 
-1639,12 +1705,24 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab)); + { + // debug print + stream.finish(); //debug + print_mem_f16(stream, scratch.gate, "gate_up_output", n_token); + } + // down kernel.down.forward(dnn_stream, n_token, convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.routing_weights, {n_token * max_topk}, dnnl::memory::format_tag::a)); + + { + // debug print + stream.finish(); //debug + print_mem_f16(stream, scratch.y, "down_with_weights_output", n_token); + } // index_add result_event = execute_stage({result_event}, instance, @@ -1654,6 +1732,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(n_token), static_cast(_hidden_size)}, {1, lws_size}, instance.needs_completion_event()); + + { + // debug print + stream.finish(); //debug + print_mem_f16(stream, final_hidden_states_mem_ptr, "final_output"); + } } return result_event; From 3b943869da7f0c76bb18a378e30d230c02f3ffac Mon Sep 17 00:00:00 2001 From: River Date: Tue, 25 Nov 2025 23:56:31 +0800 Subject: [PATCH 05/20] Success to run gs=-1 mode model --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 4 ++-- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 8 ++++---- .../src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 349c019ef63c79..b32f44de713769 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -200,7 +200,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, const auto& zp_layout = params.get_input_layout(zp_idx); const auto zp_dt = convert_type(zp_layout.data_type); problem_moe.Tao = zp_dt; - problem_moe.AO.setAlignment(zp_dt == gemmstone::Type::u4 ? 1 : static_cast(zp_dt.size())); + problem_moe.AO.setAlignment(zp_dt == micro::Type::u4 ? 1 : static_cast(zp_dt.size())); problem_moe.AO.layout = micro::MatrixLayout::N; problem_moe.aoPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); // Calculate A/B row/column sums in kernel. @@ -238,7 +238,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, } DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { const auto wei_idx = this->m_wei_idx; - return DispatchDataFunc{[this, wei_idx](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { + return DispatchDataFunc{[wei_idx](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { assert(!params.is_dynamic()); std::cout << "MoE3GemmMicroGenerator::DispatchDataFunc()" << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 7477ca0a1f79b6..c2a12a1b946a0d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -504,7 +504,7 @@ class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { jit.make("PREFILL_SWIGLU_ENABLE", 1); jit.make("SUBGROUP_SIZE", info.arch >= gpu_arch::xe2 ? 
32 : 16); - jit.make("HIDDEN_SIZE", desc->_config.hidden_size); + jit.make("INTERMEDIA_SIZE", desc->_config.inter_size); jit.make("MOE_DTYPE", "half"); return jit; } @@ -805,8 +805,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(prefill_gather, params); add_stage(micro_gemm_gate, params); add_stage(micro_gemm_up, params); - add_stage(micro_gemm_down, params); add_stage(prefill_swiglu, params); + add_stage(micro_gemm_down, params); add_stage(prefill_scatter_reduce, params); } } @@ -1371,14 +1371,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; - std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _hidden_size << std::endl; + std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _intermediate_size << std::endl; ret_event = execute_stage({ret_event}, instance, *prefill_swiglu, {intermediates_memories[2], intermediates_memories[6]}, {intermediates_memories[6]}, - {static_cast(token_size), static_cast(_hidden_size), 1}, + {static_cast(token_size), static_cast(_intermediate_size), 1}, {1, subgroup_size, 1}); ret_event->wait(); //debug diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index ef5260746b75ea..0ab78a37340669 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -142,7 +142,7 @@ KERNEL(swiglu_ref) ( const uint token_idx = get_global_id(0); const uint n_offset = get_global_id(1); - const uint offset = token_idx * HIDDEN_SIZE + n_offset; + const uint offset = token_idx * INTERMEDIA_SIZE + n_offset; #if MOE_DTYPE_SIZE == 2 half up_value = 
as_half(intel_sub_group_block_read_us((const __global ushort *)(up + offset))); half gate_value = as_half(intel_sub_group_block_read_us((const __global ushort *)(gate + offset))); From 3a9e5d537f8c26e9e4bac5517c5231cfe31183ba Mon Sep 17 00:00:00 2001 From: River Date: Thu, 27 Nov 2025 00:00:34 +0800 Subject: [PATCH 06/20] Add scale/zp repack --- .../graph/impls/ocl_v2/moe/moe_3gemm_base.hpp | 3 + .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 51 ++- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 361 ++++++++++++++---- .../impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 100 +++++ 4 files changed, 433 insertions(+), 82 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp index ea9de9677947ec..c7a2d26b6f2924 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp @@ -38,4 +38,7 @@ struct moe_3gemm_config { bool has_batch_dim = false; // 0 - pa, 1 - non-pa }; +struct MoE3GemmRuntimeParams : public MoEGemmRuntimeParams { +}; + } // namespace ov::intel_gpu::ocl \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index b32f44de713769..216c7f38b0c879 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -61,15 +61,15 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); std::cout << "\t m_wei_idx: " << m_wei_idx << std::endl; - std::cout << "\t params.input_layouts[m_wei_idx].get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; + std::cout << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << 
std::endl; const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); // u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); - std::cout << "\t expert_stride: " << expert_stride / 2 << std::endl; + // std::cout << "\t expert_stride: " << expert_stride / 2 << std::endl; std::cout << "\t m_scale_idx: " << m_scale_idx << std::endl; - std::cout << "\t params.input_layouts[m_scale_idx].get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; + std::cout << "\t m_scale_idx.get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; if (cfg.weight_group_size > 0) { jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); std::cout << "\t NUM_GROUPS: " << params.input_layouts[m_scale_idx].get_shape()[1] << std::endl; @@ -85,14 +85,14 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& // f16:bfyx:[?,2048]:nopad jit.make("INPUT_STRIDE", desc->_config.hidden_size); jit.make("OUTPUT_STRIDE", desc->_config.inter_size); - std::cout << "\t INPUT_STRIDE: " << desc->_config.hidden_size << std::endl; - std::cout << "\t OUTPUT_STRIDE: " << desc->_config.inter_size << std::endl; + // std::cout << "\t INPUT_STRIDE: " << desc->_config.hidden_size << std::endl; + // std::cout << "\t OUTPUT_STRIDE: " << desc->_config.inter_size << std::endl; break; case MoE3GemmMicroKernelType::MLP_DOWN: jit.make("INPUT_STRIDE", desc->_config.inter_size); jit.make("OUTPUT_STRIDE", desc->_config.hidden_size); - std::cout << "\t INPUT_STRIDE: " << desc->_config.inter_size << std::endl; - std::cout << "\t OUTPUT_STRIDE: " << desc->_config.hidden_size << std::endl; + // std::cout << "\t INPUT_STRIDE: " << desc->_config.inter_size << std::endl; + // std::cout << "\t OUTPUT_STRIDE: " << desc->_config.hidden_size << std::endl; break; 
default: OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); @@ -100,11 +100,11 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& } auto slm_size = moe_gemm.getSetting("slm_size"); - std::cout << "MoE3GemmMicroGenerator::get_jit_constants() slm_size: " << slm_size << std::endl; + // std::cout << "MoE3GemmMicroGenerator::get_jit_constants() slm_size: " << slm_size << std::endl; if (slm_size > 0) jit.make("USE_SLM", 1); - std::cout << "MoE3GemmMicroGenerator::get_jit_constants() done " << std::endl; + // std::cout << "MoE3GemmMicroGenerator::get_jit_constants() done " << std::endl; return jit; } @@ -234,7 +234,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, } catch (const std::runtime_error& ex) { OPENVINO_THROW("Can't create moe micro kernel: ", ex.what()); } - std::cout << "init_microkernels is done" << std::endl; + // std::cout << "init_microkernels is done" << std::endl; } DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { const auto wei_idx = this->m_wei_idx; @@ -301,6 +301,10 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p // if (params.is_dynamic()) // args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); // auto cfg = get_moe_cfg(params); + auto desc = params.typed_desc(); + auto need_repack = desc->_config.group_size != std::numeric_limits::max(); + + std::cout << "MoE3GemmMicroGenerator::get_arguments_desc() need_repack: " << need_repack << std::endl; switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: @@ -312,8 +316,13 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale - 
args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp + if(need_repack) { + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 13}); // repacked scale buffer + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 14}); // repacked zp buffer + } else { + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp + } break; case MoE3GemmMicroKernelType::MLP_UP: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor @@ -324,8 +333,13 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp + if(need_repack) { + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 15}); // repacked scale buffer + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 16}); // repacked zp buffer + } else { + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp + } break; case MoE3GemmMicroKernelType::MLP_DOWN: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // intermediate_mem[6] @@ -336,8 +350,13 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m 
args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp + if(need_repack) { + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 17}); // repacked scale buffer + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 18}); // repacked zp buffer + } else { + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp + } break; default: OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index c2a12a1b946a0d..b9f0e03e6784e4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -7,6 +7,9 @@ #include "moe_3gemm_swiglu_opt.hpp" // clang-format on +#define DUMP_TENSOR_CONTENTS 0 +#define DEBUG_MOE_LOG 0 + #ifdef ENABLE_ONEDNN_FOR_GPU # include # include @@ -363,7 +366,6 @@ class MoE3GemmSwigluSoftMaxTopK : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluSoftMaxTopK::get_dispatch_data_func()" << std::endl; }}; } }; @@ -394,7 +396,6 @@ class MoE3GemmSwigluGather : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluGather::get_dispatch_data_func()" << std::endl; }}; } }; @@ -486,7 
+487,35 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluPrefillGather::get_dispatch_data_func()" << std::endl; + }}; + } +}; + +class MoE3GemmSwigluPrefillRepack : public KernelGenerator { +public: + MoE3GemmSwigluPrefillRepack() : KernelGenerator("moe_3gemm_swiglu_fuse", "prefill_repack") {} + +protected: + [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override { + auto jit = KernelGenerator::get_jit_constants(params); + auto desc = params.typed_desc(); + auto& engine = params.prog->get_engine(); + const auto& info = engine.get_device_info(); + + jit.make("PREFILL_SCALE_ZP_REPACK", 1); + jit.make("SUBGROUP_SIZE", info.arch >= gpu_arch::xe2 ? 32 : 16); + jit.make("INTERMEDIA_SIZE", desc->_config.inter_size); + return jit; + } + + [[nodiscard]] Arguments get_arguments_desc(const RuntimeParams& params) const override { + Arguments args; + + return args; + } + + [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { }}; } }; @@ -517,7 +546,6 @@ class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluPrefillSwiglu::get_dispatch_data_func()" << std::endl; }}; } }; @@ -564,7 +592,6 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << 
"MoE3GemmSwigluPrefillScatterReduce::get_dispatch_data_func()" << std::endl; }}; } }; @@ -591,7 +618,6 @@ class MoE3GemmSwigluScatter : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluScatter::get_dispatch_data_func()" << std::endl; }}; } }; @@ -646,7 +672,6 @@ class MoE3GemmSwigluMLPGateUp : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluMLPGateUp::get_dispatch_data_func()" << std::endl; }}; } }; @@ -671,7 +696,6 @@ class MoE3GemmSwigluMLPDown : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluMLPDown::get_dispatch_data_func()" << std::endl; }}; } }; @@ -696,7 +720,6 @@ class MoE3GemmSwigluMLPReduce : public KernelGenerator { [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - std::cout << "MoE3GemmSwigluMLPReduce::get_dispatch_data_func()" << std::endl; }}; } }; @@ -706,7 +729,8 @@ dnnl::memory convert2dnnl(const memory::ptr& ptr, const std::vector& di return ptr->get_onednn_memory(dnnl::memory::desc(dnnl::memory::dims(dim), convert_data_type(ptr->get_layout().data_type), tag), offset); } -static bool use_micro_gemm_prefill = true; +static bool use_micro_gemm_prefill; +static bool need_repack_scale_zp; class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { public: DECLARE_OBJECT_TYPE_SERIALIZATION(ov::intel_gpu::ocl::MoE3GemmSwigluImpl) @@ -723,6 +747,7 @@ class moe_3gemm_swiglu_opt_impl : 
public PrimitiveImplOCL { Stage::Ptr micro_gemm_down = make_stage(MoE3GemmMicroKernelType::MLP_DOWN); Stage::Ptr prefill_swiglu = make_stage(); Stage::Ptr prefill_scatter_reduce = make_stage(); + Stage::Ptr prefill_scale_zp_repack = make_stage(); struct dnnl_weights { dnnl::memory weight; @@ -785,6 +810,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { moe_3gemm_swiglu_opt_impl() : PrimitiveImplOCL(moe_3gemm_swiglu_opt::get_type_info_static()) {} moe_3gemm_swiglu_opt_impl(const program_node& node, const RuntimeParams& params) : moe_3gemm_swiglu_opt_impl() { + if (m_rt_params == nullptr) { + m_rt_params = std::make_unique(); + } + // auto rtp = static_cast(m_rt_params.get()); init(node.as().get_primitive()); add_stage(softmax_topk, params); @@ -796,11 +825,23 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto use_micro_gemm_prefill_str = std::getenv("MOE_USE_MICRO_GEMM_PREFILL"); if (use_micro_gemm_prefill_str) { - std::cout << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; + GPU_DEBUG_TRACE_DETAIL << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; use_micro_gemm_prefill = std::stoi(use_micro_gemm_prefill_str); } else { use_micro_gemm_prefill = true; } + + auto& engine = params.prog->get_engine(); + const auto& info = engine.get_device_info(); + if (info.arch < gpu_arch::xe2) { + use_micro_gemm_prefill = false; + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): use_micro_gemm_prefill=" << use_micro_gemm_prefill + << ", arch=" << static_cast(info.arch) << std::endl; + } else { + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): use_micro_gemm_prefill=" << use_micro_gemm_prefill + << ", arch=" << static_cast(info.arch) << std::endl; + } + if (use_micro_gemm_prefill) { add_stage(prefill_gather, params); add_stage(micro_gemm_gate, params); @@ -808,6 +849,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(prefill_swiglu, params); 
add_stage(micro_gemm_down, params); add_stage(prefill_scatter_reduce, params); + add_stage(prefill_scale_zp_repack, params); } } @@ -816,10 +858,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { _intermediate_size = static_cast(cur_moe->_config.inter_size); _gate_up_group_size = static_cast(cur_moe->_config.group_size); _down_group_size = static_cast(cur_moe->_config.group_size); + // auto rtp = static_cast(m_rt_params.get()); if (cur_moe->_config.group_size == std::numeric_limits::max()) { _gate_up_group_size = static_cast(cur_moe->_config.hidden_size); _down_group_size = static_cast(cur_moe->_config.inter_size); + need_repack_scale_zp = false; + } else { + need_repack_scale_zp = true; } GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt prefill: group_size=" << cur_moe->_config.group_size @@ -888,24 +934,29 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::vector get_internal_buffer_descs(const kernel_impl_params& params) const override { auto cur_moe = params.typed_desc(); const auto& config = cur_moe->_config; - int max_topk = static_cast(config.top_k); - int expert_num = static_cast(config.num_expert); + size_t max_topk = static_cast(config.top_k); + size_t expert_num = static_cast(config.num_expert); auto hidden_states_layout = params.input_layouts[0]; - auto batch = static_cast(hidden_states_layout.get_shape()[0]); + auto batch = static_cast(hidden_states_layout.get_shape()[0]); auto data_type = hidden_states_layout.data_type; + if (cur_moe->_config.group_size == std::numeric_limits::max()) { + need_repack_scale_zp = false; + } else { + need_repack_scale_zp = true; + } std::vector internal_buffers; // softmax+topk - layout layout_topk_id(ov::PartialShape{batch, max_topk}, data_types::u32, cldnn::format::bfyx); - layout layout_topk_weights(ov::PartialShape{batch, max_topk}, data_type, cldnn::format::bfyx); + layout layout_topk_id(ov::Shape{batch, max_topk}, data_types::u32, cldnn::format::bfyx); + layout 
layout_topk_weights(ov::Shape{batch, max_topk}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(layout_topk_id, true); // 0: topk_id internal_buffers.emplace_back(layout_topk_weights, true); // 1: topk_weights // fast single batch: scratch.up = up(x) * silu(gate(x)); scratch.y = down(scratch.up) * weight[expert_no] // To support micro_gemm, prefill need to allocate max_topk * batch for input data of micro_gemm auto max_batch = max_topk * batch; - layout layout_gateup_out(ov::PartialShape{max_batch, static_cast(config.inter_size)}, data_type, cldnn::format::bfyx); - layout layout_down_out(ov::PartialShape{max_batch, static_cast(config.hidden_size)}, data_type, cldnn::format::bfyx); + layout layout_gateup_out(ov::Shape{max_batch, static_cast(config.inter_size)}, data_type, cldnn::format::bfyx); + layout layout_down_out(ov::Shape{max_batch, static_cast(config.hidden_size)}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(layout_gateup_out, true); // 2: up output internal_buffers.emplace_back(layout_down_out, true); // 3: down output // onednn: scratch.x, scratch.routing_weights = gather(x, ...) 
@@ -913,21 +964,43 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // scratch.gate = gate(scratch.x) * scratch.up // scratch.y = down(scratch.gate) * routing_weights internal_buffers.emplace_back(layout_down_out, true); // 4: up/gate input, scratch.x has same layout with down output - layout routing_layout(ov::PartialShape{batch * max_topk}, data_type, cldnn::format::bfyx); + layout routing_layout(ov::Shape{batch * max_topk}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(routing_layout, true); // 5: routing_weights internal_buffers.emplace_back(layout_gateup_out, true); // 6: gate output, scratch.gate has same layout with up // expert masks for gpu - layout index_layout(ov::PartialShape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); + layout index_layout(ov::Shape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); internal_buffers.emplace_back(index_layout, true); // 7: batch internal_buffers.emplace_back(index_layout, true); // 8: topk + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] get_internal_buffer_descs(): use_micro_gemm_prefill=" << use_micro_gemm_prefill + << ", need_repack_scale_zp=" << need_repack_scale_zp << std::endl; // for micro_gemm - layout layout_micro_gemm(ov::PartialShape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); - internal_buffers.emplace_back(layout_micro_gemm, true); // 9: experts_ids for each activated expert - internal_buffers.emplace_back(layout_micro_gemm, true); // 10: token start offset idx (input gather tokens) for each activated expert - internal_buffers.emplace_back(layout_micro_gemm, true); // 11: token len (input gather tokens) for each activated expert - layout layout_token_idx(ov::PartialShape{batch * max_topk}, ov::element::i32, cldnn::format::bfyx); - internal_buffers.emplace_back(layout_token_idx, true); // 12: token idx per expert + if(use_micro_gemm_prefill && batch > 1) { + layout layout_micro_gemm(ov::Shape{expert_num, batch}, ov::element::i32, 
cldnn::format::bfyx); + internal_buffers.emplace_back(layout_micro_gemm, true); // 9: experts_ids for each activated expert + internal_buffers.emplace_back(layout_micro_gemm, true); // 10: token start offset idx (input gather tokens) for each activated expert + internal_buffers.emplace_back(layout_micro_gemm, true); // 11: token len (input gather tokens) for each activated expert + layout layout_token_idx(ov::Shape{batch * max_topk}, ov::element::i32, cldnn::format::bfyx); + internal_buffers.emplace_back(layout_token_idx, true); // 12: token idx per expert + + // repack scale/zp + if(need_repack_scale_zp) { + auto gate_up_scale_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::SCALE_0)].get_shape(); + layout gate_up_scale_layout(ov::Shape{expert_num, gate_up_scale_shape[2], gate_up_scale_shape[1]}, ov::element::f16, cldnn::format::bfyx); + auto gate_up_zp_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::ZP_0)].get_shape(); + layout gate_up_zp_layout(ov::Shape{expert_num, gate_up_zp_shape[2], gate_up_zp_shape[1]}, ov::element::u4, cldnn::format::bfyx); + internal_buffers.emplace_back(gate_up_scale_layout, true); // 13: gate_scale + internal_buffers.emplace_back(gate_up_zp_layout, true); // 14: gate_zp + internal_buffers.emplace_back(gate_up_scale_layout, true); // 15: up_scale + internal_buffers.emplace_back(gate_up_zp_layout, true); // 16: up_zp + auto down_scale_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::SCALE_2)].get_shape(); + layout down_scale_layout(ov::Shape{expert_num, down_scale_shape[2], down_scale_shape[1]}, ov::element::f16, cldnn::format::bfyx); + auto down_zp_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::ZP_2)].get_shape(); + layout down_zp_layout(ov::Shape{expert_num, down_zp_shape[2], down_zp_shape[1]}, ov::element::u4, cldnn::format::bfyx); + internal_buffers.emplace_back(down_scale_layout, true); // 17: down_scale + internal_buffers.emplace_back(down_zp_layout, true); // 18: down_zp + } + } 
return internal_buffers; } @@ -1050,11 +1123,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::kernel_arguments_data args; cldnn::kernel_arguments_desc desc; - std::cout << "moe::execute_stage: " << stage.kernel->get_id() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "moe::execute_stage: " << stage.kernel->get_id() << std::endl; for (uint32_t i = 0; i < inputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::INPUT, i}); args.inputs.push_back(inputs[i]); - std::cout << "\tinput[" << i << "]: " << inputs[i]->get_layout().to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\tinput[" << i << "]: " << inputs[i]->get_layout().to_short_string() << std::endl; } cldnn::scalars_desc scalar_desc; if (!scalar_inputs.empty()) { @@ -1065,17 +1138,17 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { scalar_desc[i].v.s32 = scalar_inputs[i]; } args.scalars = &scalar_desc; - std::cout << "\tscalar_inputs: "; + GPU_DEBUG_TRACE_DETAIL << "\tscalar_inputs: "; for (const auto& scalar : scalar_inputs) { - std::cout << scalar << " "; + GPU_DEBUG_TRACE_DETAIL << scalar << " "; } - std::cout << std::endl; + GPU_DEBUG_TRACE_DETAIL << std::endl; } for (uint32_t i = 0; i < outputs.size(); i++) { desc.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, i}); args.outputs.push_back(outputs[i]); - std::cout << "\toutput[" << i << "]: " << outputs[i]->get_layout().to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\toutput[" << i << "]: " << outputs[i]->get_layout().to_short_string() << std::endl; } stream.set_arguments(*stage.kernel, desc, args); @@ -1083,11 +1156,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { desc.workGroups.local = local; if (global.size() == 2) { - std::cout << "\tgws = {" << global[0] << ", " << global[1] << "}" << std::endl; - std::cout << "\tlws = {" << local[0] << ", " << local[1] << "}" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\tgws = {" << global[0] << ", " << global[1] << "}" 
<< std::endl; + GPU_DEBUG_TRACE_DETAIL << "\tlws = {" << local[0] << ", " << local[1] << "}" << std::endl; } else if (global.size() == 3) { - std::cout << "\tgws = {" << global[0] << ", " << global[1] << ", " << global[2] << "}" << std::endl; - std::cout << "\tlws = {" << local[0] << ", " << local[1] << ", " << local[2] << "}" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\tgws = {" << global[0] << ", " << global[1] << ", " << global[2] << "}" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\tlws = {" << local[0] << ", " << local[1] << ", " << local[2] << "}" << std::endl; } return stream.enqueue_kernel(*stage.kernel, desc, {}, events, needs_completion_event); @@ -1166,10 +1239,33 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return ret; } - void print_mem_f16(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 1024) { +#if DUMP_TENSOR_CONTENTS + void print_mem_f16(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 50) { auto layout = mem->get_layout().get_shape(); - size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; - size_t col = layout.size() >= 2 ? 
layout[layout.size() - 1] : layout[0]; + size_t row = 0; + size_t col = 0; + + switch(layout.size()) { + case 1: + row = 1; + col = layout[0]; + break; + case 2: + row = layout[0]; + col = layout[1]; + break; + case 3: + row = layout[0] * layout[1]; + col = layout[2]; + break; + case 4: + row = layout[0] * layout[1] * layout[2]; + col = layout[3]; + break; + default: + OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); + } + cldnn::mem_lock lock_data{mem, stream}; std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; for (size_t j = 0; j < row && j < max_row; j++) { @@ -1183,6 +1279,46 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << std::endl; }; + void print_mem_u4(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 50) { + auto layout = mem->get_layout().get_shape(); + size_t row = 0; + size_t col = 0; + + switch(layout.size()) { + case 1: + row = 1; + col = layout[0]; + break; + case 2: + row = layout[0]; + col = layout[1]; + break; + case 3: + row = layout[0] * layout[1]; + col = layout[2]; + break; + case 4: + row = layout[0] * layout[1] * layout[2]; + col = layout[3]; + break; + default: + OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); + } + + col = col / 2; //u4 + cldnn::mem_lock lock_data{mem, stream}; + std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; + for (size_t j = 0; j < row && j < max_row; j++) { + std::cout << "\t[" << j << "]: "; + for (size_t i = 0; i < col && i < 16; i++) { + uint8_t byte_val = lock_data[j * col + i]; + std::cout << (byte_val & 0xF) << ", " << ((byte_val >> 4) & 0xF) << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + }; + void print_mem(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, int max_print = 1024) { auto layout = mem->get_layout().get_shape(); size_t row = layout.size() >= 2 ? 
layout[layout.size() - 2] : 1; @@ -1203,6 +1339,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } std::cout << std::endl; }; +#endif cldnn::event::ptr exec_prefill_opt(const std::vector& events, typed_primitive_inst& instance, @@ -1219,6 +1356,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { _hidden_size = static_cast(cur_moe->_config.hidden_size); _intermediate_size = static_cast(cur_moe->_config.inter_size); + auto rtp = static_cast(m_rt_params.get()); const size_t subgroup_size = instance.get_impl_params()->get_device_info().arch >= gpu_arch::xe2 ? 32 : 16; // const size_t max_work_group_size = instance.get_impl_params()->get_device_info().max_work_group_size; @@ -1255,10 +1393,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } } } - auto rtp = static_cast(m_rt_params.get()); rtp->num_actually_used_experts = num_actually_used_experts; // debug print + #if DEBUG_MOE_LOG { std::cout << "\nstep 1: prefill_mask num_actually_used_experts=" << num_actually_used_experts << std::endl; std::cout << "expert_id[" << num_actually_used_experts << "]: = "; @@ -1288,6 +1426,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } std::cout << std::endl; } + #endif } // step 2: generate gather input tokens @@ -1296,16 +1435,18 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 1: token idx per expert, static shape = [token_num * topK_num] // output // 0: gathered token: shape = [token_len * expert_topK, hidden_size] - if(1){ + { auto hidden_size = _hidden_size; auto block_size = GetBlockSize(*instance.get_impl_params()); auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(*instance.get_impl_params()), block_size, hidden_size); auto token_per_expert = intermediates_memories[12]->get_layout().get_shape()[0]; + #if DEBUG_MOE_LOG std::cout << "\nstep 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread << 
", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << ", block_size = " << block_size << std::endl; + #endif ret_event = execute_stage(events, instance, *prefill_gather, @@ -1314,11 +1455,74 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(token_per_expert * local_threads_count), 1, 1}, {static_cast(local_threads_count), 1, 1}); - stream.finish(); //debug - // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); - // print_mem(stream, intermediates_memories[12], "token idx per expert"); - // print_mem_f16(stream, scratch.x, "gathered token"); - std::cout << std::endl; + #if DUMP_TENSOR_CONTENTS + { + stream.finish(); //debug + // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); + // print_mem(stream, intermediates_memories[12], "token idx per expert"); + // print_mem_f16(stream, scratch.x, "gathered token"); + std::cout << std::endl; + } + #endif + } + + // step 2.5: repack scale/zp if needed + if (need_repack_scale_zp) { + #if DEBUG_MOE_LOG + std::cout << "\nstep 2.5: repack scale/zp for moe_gemm" << std::endl; + #endif + // gate + ret_event = execute_stage(events, + instance, + *prefill_scale_zp_repack, + { + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)), + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)), + }, + { + intermediates_memories[13], + intermediates_memories[14], + }, + {static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, + {1, 1, subgroup_size}); + // up + ret_event = execute_stage(events, + instance, + *prefill_scale_zp_repack, + { + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)), + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)), + }, + { + intermediates_memories[15], + intermediates_memories[16], + }, + 
{static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, + {1, 1, subgroup_size}); + // down + ret_event = execute_stage(events, + instance, + *prefill_scale_zp_repack, + { + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)), + instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)), + }, + { + intermediates_memories[17], + intermediates_memories[18], + }, + {static_cast(num_total_experts), static_cast(_intermediate_size/_down_group_size), static_cast(_hidden_size/2)}, + {1, 1, subgroup_size}); + + #if DUMP_TENSOR_CONTENTS + { + stream.finish(); //debug + print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)), "up_scale_1"); + print_mem_f16(stream, intermediates_memories[15], "repack_up_scale"); + print_mem_u4(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)), "up_zp_1"); + print_mem_u4(stream, intermediates_memories[16], "repack_up_zp"); + } + #endif } // step 3: moe_gemm for up and gate @@ -1334,31 +1538,35 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 8: wei_zp // output: // 0: up/gate output, shape = [token_len * expert_topK, hidden_size] - if(1){ + { + #if DEBUG_MOE_LOG std::cout << "\nstep 3: moe_gemm for up and gate" << std::endl; + #endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); - ret_event->wait(); //debug - stream.finish(); //debug + #if DUMP_TENSOR_CONTENTS { + stream.finish(); //debug print_mem_f16(stream, intermediates_memories[4], "up_token_input"); print_mem(stream, intermediates_memories[9], "up_expert_id", num_actually_used_experts); print_mem(stream, intermediates_memories[10], "up_input_offset_per_expert", num_actually_used_experts); print_mem(stream, intermediates_memories[11], "up_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[2], "up_output"); } + #endif ret_event = 
PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); - ret_event->wait(); //debug - stream.finish(); //debug + #if DUMP_TENSOR_CONTENTS { + stream.finish(); //debug // print_mem_f16(stream, intermediates_memories[4], "gate_token_input"); // print_mem(stream, intermediates_memories[9], "gate_expert_id", num_actually_used_experts); // print_mem(stream, intermediates_memories[10], "gate_input_offset_per_expert", num_actually_used_experts); // print_mem(stream, intermediates_memories[11], "gate_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[6], "gate_output"); } + #endif } // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmod(x)=x*(1+exp(-x)) @@ -1371,7 +1579,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; + #if DEBUG_MOE_LOG std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _intermediate_size << std::endl; + #endif ret_event = execute_stage({ret_event}, instance, @@ -1381,13 +1591,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(token_size), static_cast(_intermediate_size), 1}, {1, subgroup_size, 1}); - ret_event->wait(); //debug - stream.finish(); //debug - + #if DUMP_TENSOR_CONTENTS { + ret_event->wait(); //debug + stream.finish(); //debug print_mem_f16(stream, intermediates_memories[2], "silu_up_input"); print_mem_f16(stream, intermediates_memories[6], "silu_gate_up_output"); } + #endif } // step 5: moe_gemm for down @@ -1403,20 +1614,22 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 8: wei_zp // output: // 0: down output, shape = [token_len * expert_topK, hidden_size] - - if(1){ + { + #if DEBUG_MOE_LOG std::cout << "\nstep 5: moe_gemm for down" << std::endl; + #endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, 
instance, micro_gemm_down); - ret_event->wait(); //debug - stream.finish(); //debug + #if DUMP_TENSOR_CONTENTS { + stream.finish(); //debug print_mem_f16(stream, intermediates_memories[6], "down_token_input"); // print_mem(stream, intermediates_memories[9], "down_expert_id", num_actually_used_experts); // print_mem(stream, intermediates_memories[10], "down_input_offset_per_expert", num_actually_used_experts); // print_mem(stream, intermediates_memories[11], "down_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[3], "down_output"); } + #endif } // step 6: scatter and reduce @@ -1430,14 +1643,15 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 6: expert id, dynamic shape = [activated_expert_num] // output: // 0: final hidden states, shape = [token_len, hidden_size] - - if(1){ + { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; auto [local_threads_count, batches_per_thread, _] = calc_thread_count(const_cast(*instance.get_impl_params()), 4, _hidden_size); + #if DEBUG_MOE_LOG std::cout << "\nstep 6: prefill_scatter_reduce token_size=" << token_size << ", local_threads_count=" << local_threads_count << ", num_actually_used_experts = " << num_actually_used_experts << std::endl; + #endif ret_event = execute_stage({ret_event}, instance, @@ -1455,9 +1669,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { instance.needs_completion_event(), {num_actually_used_experts}); - ret_event->wait(); //debug - stream.finish(); //debug + #if DUMP_TENSOR_CONTENTS { + stream.finish(); //debug print_mem_f16(stream, intermediates_memories[3], "scatter_reduce_input"); print_mem(stream, batch_mem_ptr, "scatter_reduce_experts_per_token"); print_mem_f16(stream, routing_mem_ptr, "scatter_reduce_expert_weights"); @@ -1467,6 +1681,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { print_mem(stream, 
intermediates_memories[9], "scatter_reduce_expert_id", num_actually_used_experts); print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); } + #endif } return ret_event; @@ -1474,10 +1689,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { void update_rt_params(const primitive_inst& instance) override { if (m_rt_params == nullptr) { - m_rt_params = std::make_unique(); + m_rt_params = std::make_unique(); } update_stages_flags(instance); - // auto rtp = static_cast(m_rt_params.get()); } void update(primitive_inst& inst, const kernel_impl_params& impl_params) override { @@ -1624,10 +1838,15 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { expert_mask_cpu expert_mask; get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); + GPU_DEBUG_TRACE_DETAIL << "\nMoE3GemmFusedCompressed exec(): batch=" << batch << ", max_topk=" << max_topk + << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill + << ", need_repack_scale_zp=" << need_repack_scale_zp << std::endl; if (use_micro_gemm_prefill) { - std::cout << "\nUse micro_gemm prefill path" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\nUse micro_gemm prefill path" << std::endl; update_rt_params(instance); return exec_prefill_opt({topk_event}, instance, scratch, expert_mask); + } else { + GPU_DEBUG_TRACE_DETAIL << "\nUse onednn path" << std::endl; } auto& dnn_stream = stream.get_onednn_stream(); @@ -1675,6 +1894,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {1, lws_size}, instance.needs_completion_event()); + #if DUMP_TENSOR_CONTENTS { // debug print std::cout << "expert_no=" << expert_no << ", n_token=" << n_token << ", hidden_size=" << _hidden_size @@ -1684,6 +1904,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { print_mem_f16(stream, scratch.x, "gathered_token", n_token); print_mem_f16(stream, scratch.routing_weights, "routing_weights"); } + #endif // up kernel.up.forward(dnn_stream, @@ -1692,11 +1913,13 @@ class 
moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), dnnl::memory()); + #if DUMP_TENSOR_CONTENTS { // debug print stream.finish(); //debug print_mem_f16(stream, scratch.up, "up_output", n_token); } + #endif // gate kernel.gate.forward(dnn_stream, @@ -1705,11 +1928,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab)); + #if DUMP_TENSOR_CONTENTS { // debug print stream.finish(); //debug print_mem_f16(stream, scratch.gate, "gate_up_output", n_token); } + #endif // down kernel.down.forward(dnn_stream, @@ -1718,11 +1943,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.routing_weights, {n_token * max_topk}, dnnl::memory::format_tag::a)); + #if DUMP_TENSOR_CONTENTS { // debug print stream.finish(); //debug print_mem_f16(stream, scratch.y, "down_with_weights_output", n_token); } + #endif + // index_add result_event = execute_stage({result_event}, instance, @@ -1732,12 +1960,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(n_token), static_cast(_hidden_size)}, {1, lws_size}, instance.needs_completion_event()); - + #if DUMP_TENSOR_CONTENTS { // debug print stream.finish(); //debug print_mem_f16(stream, final_hidden_states_mem_ptr, "final_output"); } + #endif } return result_event; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index 0ab78a37340669..7138fa370cae8e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -158,4 +158,104 @@ KERNEL(swiglu_ref) ( #endif } +#elif PREFILL_SCALE_ZP_REPACK +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) +KERNEL(repack_ref) ( + const __global half* scale_src, + const __global char* zp_src, + __global half* scale_dst, + __global char* zp_dst +) { + const uint expert_idx = get_global_id(0); + const uint group_num = get_global_size(1); + const uint n_num = get_global_size(2) * 2; + const uint group_idx = get_global_id(1); + const uint n_idx = get_global_id(2) * 2; + + // Source: [Experts, Groups, N] + // Dest: [Experts, N, Groups] + +#if 0 + const uint src_offset = expert_idx * n_num * group_num + group_idx * n_num + n_idx; + const uint dst_offset = expert_idx * n_num * group_num + n_idx * group_num + group_idx; + + scale_src += src_offset; + zp_src += src_offset / 2; + scale_dst += dst_offset; + zp_dst += dst_offset / 2; + + half2 src_value = as_half2(intel_sub_group_block_read_us2((const __global ushort *)(scale_src))); + intel_sub_group_block_write_us2((__global ushort *)(scale_dst), as_ushort2(src_value)); + + char zp_value = zp_src[0]; + zp_dst[0] = as_uchar(zp_value); +#else + // Calculate offsets for Scale (Source) + // src index for (e, g, n) + const uint src_idx_0 = expert_idx * group_num * n_num + group_idx * n_num + n_idx; + + // Read Scale + half s0 = scale_src[src_idx_0]; + half s1 = scale_src[src_idx_0 + 1]; + + // Calculate offsets for Scale (Dest) + // dst index for (e, n, g) + const uint dst_idx_0 = expert_idx * n_num * group_num + n_idx * group_num + group_idx; + // dst index for (e, n+1, g) + const uint dst_idx_1 = expert_idx * n_num * group_num + (n_idx + 1) * group_num + group_idx; + + // Write Scale + scale_dst[dst_idx_0] = s0; + scale_dst[dst_idx_1] = s1; + + // Handle ZP + // Only even groups process ZP to avoid race condition on byte writes + if (group_idx % 2 == 0) { + // We need zp(g, n), zp(g, n+1) -> from src(e, g, n/2) + // We 
need zp(g+1, n), zp(g+1, n+1) -> from src(e, g+1, n/2) + + // Src ZP index for (e, g, n/2) + // Note: zp_src is char*, so index is byte index. + // src_idx_0 is element index. ZP packed 2 elements per byte. + uint zp_src_idx_g0 = src_idx_0 / 2; + + // Src ZP index for (e, g+1, n/2) + // Offset difference between g and g+1 is n_num elements. + uint zp_src_idx_g1 = zp_src_idx_g0 + (n_num / 2); + + char byte_g0 = zp_src[zp_src_idx_g0]; + char byte_g1 = zp_src[zp_src_idx_g1]; + + // Unpack + // Assuming Little Endian packing: Low nibble = even index (n), High nibble = odd index (n+1) + uchar ubyte_g0 = as_uchar(byte_g0); + uchar ubyte_g1 = as_uchar(byte_g1); + + uchar zp_g0_n0 = ubyte_g0 & 0x0F; + uchar zp_g0_n1 = (ubyte_g0 >> 4) & 0x0F; + + uchar zp_g1_n0 = ubyte_g1 & 0x0F; + uchar zp_g1_n1 = (ubyte_g1 >> 4) & 0x0F; + + // Pack for Dest + // Dest packing is along Groups. + // Byte at (e, n, g/2) contains zp(e, n, g) [Low] and zp(e, n, g+1) [High] + + uchar dst_byte_n0 = zp_g0_n0 | (zp_g1_n0 << 4); + uchar dst_byte_n1 = zp_g0_n1 | (zp_g1_n1 << 4); + + // Dest ZP indices + // dst_idx_0 corresponds to (e, n, g). + // ZP array is packed along G. + // So index is dst_idx_0 / 2. + uint zp_dst_idx_n0 = dst_idx_0 / 2; + + // dst_idx_1 corresponds to (e, n+1, g). 
+ uint zp_dst_idx_n1 = dst_idx_1 / 2; + + zp_dst[zp_dst_idx_n0] = as_char(dst_byte_n0); + zp_dst[zp_dst_idx_n1] = as_char(dst_byte_n1); + } +#endif +} #endif From 7569a3e37951cce49aa10d8db833d3dd88c2d009 Mon Sep 17 00:00:00 2001 From: River Date: Thu, 27 Nov 2025 16:31:21 +0800 Subject: [PATCH 07/20] Fixed Out of resource --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index b9f0e03e6784e4..433b940e181092 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -1645,7 +1645,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 0: final hidden states, shape = [token_len, hidden_size] { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); - auto token_size = input_shape[0] * max_topk; + auto token_size = input_shape[0];// * max_topk; auto [local_threads_count, batches_per_thread, _] = calc_thread_count(const_cast(*instance.get_impl_params()), 4, _hidden_size); #if DEBUG_MOE_LOG From b923de00cfabb04d6fafa3937391b1852268eb10 Mon Sep 17 00:00:00 2001 From: River Date: Fri, 28 Nov 2025 08:41:39 +0800 Subject: [PATCH 08/20] Fix output incorrect issue by random --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 46 +++++++++---------- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 9 ++-- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 216c7f38b0c879..b8955a3effdae3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -60,22 +60,22 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("INPUT_SEQ_LEN", 4); // prefill not use it jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); - std::cout << "\t m_wei_idx: " << m_wei_idx << std::endl; - std::cout << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx: " << m_wei_idx << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); // u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); // std::cout << "\t expert_stride: " << expert_stride / 2 << std::endl; - std::cout << "\t m_scale_idx: " << m_scale_idx << std::endl; - std::cout << "\t m_scale_idx.get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx: " << m_scale_idx << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx.get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; if (cfg.weight_group_size > 0) { jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); - std::cout << "\t NUM_GROUPS: " << params.input_layouts[m_scale_idx].get_shape()[1] << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t NUM_GROUPS: " << params.input_layouts[m_scale_idx].get_shape()[1] << std::endl; } else { jit.make("NUM_GROUPS", 1); - std::cout << "\t NUM_GROUPS: 1" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t NUM_GROUPS: 1" << std::endl; } auto desc = params.typed_desc(); @@ -169,11 +169,11 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, size_t n = 
is_prefill ? 32 : 8; size_t k = weight_shape.size() == 4 ? weight_shape[2] * weight_shape[3] : weight_shape[2]; - std::cout << "MoE3GemmMicroGenerator::init_microkernels: " << std::endl; - std::cout << "\t m = " << m << ", n = " << n << ", k = " << k << std::endl; + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::init_microkernels: " << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m = " << m << ", n = " << n << ", k = " << k << std::endl; size_t group_size = weight_shape.size() == 4 ? weight_shape[3] : weight_shape[2]; - std::cout << "\t weight group size: " << group_size << "\n"; + GPU_DEBUG_TRACE_DETAIL << "\t weight group size: " << group_size << "\n"; micro::GEMMProblem problem_moe; micro::GEMMProtocol::Options opts_moe; @@ -192,7 +192,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, problem_moe.asPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); problem_moe.aqGroupM = 1; - problem_moe.aqGroupK = group_size; + problem_moe.aqGroupK = static_cast(group_size); opts_moe.scaleA = true; const bool is_weight_symmetric_quantized = false; @@ -226,8 +226,8 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, sizes.k = static_cast(k); sizes.batch = static_cast(1); - std::cout << "problem_moe:" << problem_moe.toString() << "\n"; - std::cout << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; + GPU_DEBUG_TRACE_DETAIL << "problem_moe:" << problem_moe.toString() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; try { /* Ask microkernel provider for microkernel */ gemm_moe = micro::select_gemm_microkernel(opts_moe, hw_info, sizes, problem_moe); @@ -241,7 +241,7 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { return DispatchDataFunc{[wei_idx](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { assert(!params.is_dynamic()); - std::cout << 
"MoE3GemmMicroGenerator::DispatchDataFunc()" << std::endl; + // std::cout << "MoE3GemmMicroGenerator::DispatchDataFunc()" << std::endl; auto* rtp = static_cast(rt_params); const auto& device_info = params.get_device_info(); const auto& gemm_p = kd.micro_kernels[0]->p; @@ -258,16 +258,16 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { auto input_layout = params.get_input_layout(0); auto experts_weight_layout = params.get_input_layout(wei_idx); - std::cout << "\t input_layout: " << input_layout.to_short_string() << std::endl; - std::cout << "\t wei_idx = " << wei_idx << std::endl; - std::cout << "\t experts_weight_layout: " << experts_weight_layout.to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t input_layout: " << input_layout.to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t wei_idx = " << wei_idx << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t experts_weight_layout: " << experts_weight_layout.to_short_string() << std::endl; // has_batch_dim indicates whether the input tensor has batch dimension size_t n = input_layout.get_shape().size() == 3 ? 
input_layout.get_shape()[1] : input_layout.get_shape()[0]; auto cur_moe = params.typed_desc(); const auto& config = cur_moe->_config; n = n * config.top_k; - std::cout << "\t n = " << n << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t n = " << n << std::endl; const auto& experts_weight_shape = experts_weight_layout.get_shape(); size_t m = experts_weight_shape[1]; @@ -283,7 +283,7 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { s_k.v.s32 = static_cast(k); scalars.push_back(s_k); - std::cout << "\t m = " << m << ", k = " << k << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m = " << m << ", k = " << k << std::endl; }}; } @@ -304,7 +304,7 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p auto desc = params.typed_desc(); auto need_repack = desc->_config.group_size != std::numeric_limits::max(); - std::cout << "MoE3GemmMicroGenerator::get_arguments_desc() need_repack: " << need_repack << std::endl; + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::get_arguments_desc() need_repack: " << need_repack << std::endl; switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: @@ -388,11 +388,11 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par kd.code->has_microkernels = true; try { - std::cout << "\t get_kernel_name(): " << get_kernel_name() << std::endl; - std::cout << "\t kd.code->entry_point: " << kd.code->entry_point << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t get_kernel_name(): " << get_kernel_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t kd.code->entry_point: " << kd.code->entry_point << std::endl; kd.code->str = build_code(get_kernel_name(), jit, kd.code->entry_point); } catch (const std::runtime_error& ex) { - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() - can't build code: " << ex.what() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::get_kernel_data() - can't build code: " << ex.what() << std::endl; 
OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't build code: ", ex.what()); } @@ -423,7 +423,7 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par kd.params.arguments.push_back({ArgumentDescriptor::Types::LOCAL_MEMORY_SIZE, slm_size}); } - std::cout << "MoE3GemmMicroGenerator::get_kernel_data() completed\n"; + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::get_kernel_data() completed\n"; return kd; } } // namespace ov::intel_gpu::ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 433b940e181092..49baec070a9f3a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -1472,7 +1472,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << "\nstep 2.5: repack scale/zp for moe_gemm" << std::endl; #endif // gate - ret_event = execute_stage(events, + ret_event = execute_stage({ret_event}, instance, *prefill_scale_zp_repack, { @@ -1486,7 +1486,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, {1, 1, subgroup_size}); // up - ret_event = execute_stage(events, + ret_event = execute_stage({ret_event}, instance, *prefill_scale_zp_repack, { @@ -1500,7 +1500,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, {1, 1, subgroup_size}); // down - ret_event = execute_stage(events, + ret_event = execute_stage({ret_event}, instance, *prefill_scale_zp_repack, { @@ -1668,7 +1668,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {local_threads_count, 1, 1}, instance.needs_completion_event(), 
{num_actually_used_experts}); - + // TODO: remove this sync which maybe lead to output is incorrect + stream.finish(); #if DUMP_TENSOR_CONTENTS { stream.finish(); //debug From 43b23c05cbfc6b7ab1861622f1523115b64e248d Mon Sep 17 00:00:00 2001 From: River Date: Fri, 28 Nov 2025 08:49:05 +0800 Subject: [PATCH 09/20] Fix conflict --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 49baec070a9f3a..f0cc09cfc00076 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -1942,7 +1942,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { n_token, convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), - convert2dnnl(scratch.routing_weights, {n_token * max_topk}, dnnl::memory::format_tag::a)); + convert2dnnl(scratch.routing_weights, {static_cast(n_token * max_topk)}, dnnl::memory::format_tag::a)); #if DUMP_TENSOR_CONTENTS { From 3d444718c7e9f31575d478ba2226c8d5ac930991 Mon Sep 17 00:00:00 2001 From: River Date: Tue, 2 Dec 2025 11:02:28 +0800 Subject: [PATCH 10/20] Remove repack --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 30 +--- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 129 +----------------- .../impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 100 -------------- 3 files changed, 8 insertions(+), 251 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index b8955a3effdae3..e20c2f790f0adb 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -302,9 +302,6 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p // args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); // auto cfg = get_moe_cfg(params); auto desc = params.typed_desc(); - auto need_repack = desc->_config.group_size != std::numeric_limits::max(); - - GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::get_arguments_desc() need_repack: " << need_repack << std::endl; switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: @@ -316,13 +313,8 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - if(need_repack) { - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 13}); // repacked scale buffer - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 14}); // repacked zp buffer - } else { - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp - } + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp break; case MoE3GemmMicroKernelType::MLP_UP: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor @@ -333,13 +325,8 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m 
args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - if(need_repack) { - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 15}); // repacked scale buffer - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 16}); // repacked zp buffer - } else { - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp - } + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp break; case MoE3GemmMicroKernelType::MLP_DOWN: args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // intermediate_mem[6] @@ -350,13 +337,8 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - if(need_repack) { - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 17}); // repacked scale buffer - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 18}); // repacked zp buffer - } else { - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp - } + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp break; default: OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp 
index f0cc09cfc00076..0e2fb435b7a417 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -491,35 +491,6 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { } }; -class MoE3GemmSwigluPrefillRepack : public KernelGenerator { -public: - MoE3GemmSwigluPrefillRepack() : KernelGenerator("moe_3gemm_swiglu_fuse", "prefill_repack") {} - -protected: - [[nodiscard]] JitConstants get_jit_constants(const RuntimeParams& params) const override { - auto jit = KernelGenerator::get_jit_constants(params); - auto desc = params.typed_desc(); - auto& engine = params.prog->get_engine(); - const auto& info = engine.get_device_info(); - - jit.make("PREFILL_SCALE_ZP_REPACK", 1); - jit.make("SUBGROUP_SIZE", info.arch >= gpu_arch::xe2 ? 32 : 16); - jit.make("INTERMEDIA_SIZE", desc->_config.inter_size); - return jit; - } - - [[nodiscard]] Arguments get_arguments_desc(const RuntimeParams& params) const override { - Arguments args; - - return args; - } - - [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; - } -}; - class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { public: MoE3GemmSwigluPrefillSwiglu() : KernelGenerator("moe_3gemm_swiglu_fuse", "prefill_swiglu") {} @@ -730,7 +701,6 @@ dnnl::memory convert2dnnl(const memory::ptr& ptr, const std::vector& di } static bool use_micro_gemm_prefill; -static bool need_repack_scale_zp; class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { public: DECLARE_OBJECT_TYPE_SERIALIZATION(ov::intel_gpu::ocl::MoE3GemmSwigluImpl) @@ -747,7 +717,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { Stage::Ptr micro_gemm_down = make_stage(MoE3GemmMicroKernelType::MLP_DOWN); Stage::Ptr prefill_swiglu = make_stage(); Stage::Ptr prefill_scatter_reduce = make_stage(); - Stage::Ptr 
prefill_scale_zp_repack = make_stage(); struct dnnl_weights { dnnl::memory weight; @@ -849,7 +818,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(prefill_swiglu, params); add_stage(micro_gemm_down, params); add_stage(prefill_scatter_reduce, params); - add_stage(prefill_scale_zp_repack, params); } } @@ -858,15 +826,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { _intermediate_size = static_cast(cur_moe->_config.inter_size); _gate_up_group_size = static_cast(cur_moe->_config.group_size); _down_group_size = static_cast(cur_moe->_config.group_size); - // auto rtp = static_cast(m_rt_params.get()); - - if (cur_moe->_config.group_size == std::numeric_limits::max()) { - _gate_up_group_size = static_cast(cur_moe->_config.hidden_size); - _down_group_size = static_cast(cur_moe->_config.inter_size); - need_repack_scale_zp = false; - } else { - need_repack_scale_zp = true; - } GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt prefill: group_size=" << cur_moe->_config.group_size << ", gate_up_group_size=" << _gate_up_group_size << ", down_group_size=" << _down_group_size << std::endl; @@ -936,15 +895,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { const auto& config = cur_moe->_config; size_t max_topk = static_cast(config.top_k); size_t expert_num = static_cast(config.num_expert); - auto hidden_states_layout = params.input_layouts[0]; auto batch = static_cast(hidden_states_layout.get_shape()[0]); auto data_type = hidden_states_layout.data_type; - if (cur_moe->_config.group_size == std::numeric_limits::max()) { - need_repack_scale_zp = false; - } else { - need_repack_scale_zp = true; - } std::vector internal_buffers; // softmax+topk @@ -973,7 +926,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(index_layout, true); // 8: topk GPU_DEBUG_TRACE_DETAIL << "[DEBUG] get_internal_buffer_descs(): use_micro_gemm_prefill=" << use_micro_gemm_prefill - << ", 
need_repack_scale_zp=" << need_repack_scale_zp << std::endl; + << std::endl; // for micro_gemm if(use_micro_gemm_prefill && batch > 1) { layout layout_micro_gemm(ov::Shape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); @@ -982,24 +935,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(layout_micro_gemm, true); // 11: token len (input gather tokens) for each activated expert layout layout_token_idx(ov::Shape{batch * max_topk}, ov::element::i32, cldnn::format::bfyx); internal_buffers.emplace_back(layout_token_idx, true); // 12: token idx per expert - - // repack scale/zp - if(need_repack_scale_zp) { - auto gate_up_scale_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::SCALE_0)].get_shape(); - layout gate_up_scale_layout(ov::Shape{expert_num, gate_up_scale_shape[2], gate_up_scale_shape[1]}, ov::element::f16, cldnn::format::bfyx); - auto gate_up_zp_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::ZP_0)].get_shape(); - layout gate_up_zp_layout(ov::Shape{expert_num, gate_up_zp_shape[2], gate_up_zp_shape[1]}, ov::element::u4, cldnn::format::bfyx); - internal_buffers.emplace_back(gate_up_scale_layout, true); // 13: gate_scale - internal_buffers.emplace_back(gate_up_zp_layout, true); // 14: gate_zp - internal_buffers.emplace_back(gate_up_scale_layout, true); // 15: up_scale - internal_buffers.emplace_back(gate_up_zp_layout, true); // 16: up_zp - auto down_scale_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::SCALE_2)].get_shape(); - layout down_scale_layout(ov::Shape{expert_num, down_scale_shape[2], down_scale_shape[1]}, ov::element::f16, cldnn::format::bfyx); - auto down_zp_shape = params.input_layouts[static_cast(MOE3GemmInputIndex::ZP_2)].get_shape(); - layout down_zp_layout(ov::Shape{expert_num, down_zp_shape[2], down_zp_shape[1]}, ov::element::u4, cldnn::format::bfyx); - internal_buffers.emplace_back(down_scale_layout, true); // 17: down_scale - 
internal_buffers.emplace_back(down_zp_layout, true); // 18: down_zp - } } return internal_buffers; } @@ -1466,65 +1401,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { #endif } - // step 2.5: repack scale/zp if needed - if (need_repack_scale_zp) { - #if DEBUG_MOE_LOG - std::cout << "\nstep 2.5: repack scale/zp for moe_gemm" << std::endl; - #endif - // gate - ret_event = execute_stage({ret_event}, - instance, - *prefill_scale_zp_repack, - { - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)), - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)), - }, - { - intermediates_memories[13], - intermediates_memories[14], - }, - {static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, - {1, 1, subgroup_size}); - // up - ret_event = execute_stage({ret_event}, - instance, - *prefill_scale_zp_repack, - { - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)), - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)), - }, - { - intermediates_memories[15], - intermediates_memories[16], - }, - {static_cast(num_total_experts), static_cast(_hidden_size/_gate_up_group_size), static_cast(_intermediate_size/2)}, - {1, 1, subgroup_size}); - // down - ret_event = execute_stage({ret_event}, - instance, - *prefill_scale_zp_repack, - { - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)), - instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)), - }, - { - intermediates_memories[17], - intermediates_memories[18], - }, - {static_cast(num_total_experts), static_cast(_intermediate_size/_down_group_size), static_cast(_hidden_size/2)}, - {1, 1, subgroup_size}); - - #if DUMP_TENSOR_CONTENTS - { - stream.finish(); //debug - print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)), "up_scale_1"); - print_mem_f16(stream, intermediates_memories[15], "repack_up_scale"); - print_mem_u4(stream, 
instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)), "up_zp_1"); - print_mem_u4(stream, intermediates_memories[16], "repack_up_zp"); - } - #endif - } - // step 3: moe_gemm for up and gate // input // 0: gathered token, shape = [token_len * expert_topK, hidden_size] @@ -1840,8 +1716,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); GPU_DEBUG_TRACE_DETAIL << "\nMoE3GemmFusedCompressed exec(): batch=" << batch << ", max_topk=" << max_topk - << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill - << ", need_repack_scale_zp=" << need_repack_scale_zp << std::endl; + << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; if (use_micro_gemm_prefill) { GPU_DEBUG_TRACE_DETAIL << "\nUse micro_gemm prefill path" << std::endl; update_rt_params(instance); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index 7138fa370cae8e..0ab78a37340669 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -158,104 +158,4 @@ KERNEL(swiglu_ref) ( #endif } -#elif PREFILL_SCALE_ZP_REPACK -__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) -KERNEL(repack_ref) ( - const __global half* scale_src, - const __global char* zp_src, - __global half* scale_dst, - __global char* zp_dst -) { - const uint expert_idx = get_global_id(0); - const uint group_num = get_global_size(1); - const uint n_num = get_global_size(2) * 2; - const uint group_idx = get_global_id(1); - const uint n_idx = get_global_id(2) * 2; - - // Source: [Experts, Groups, N] - // Dest: [Experts, N, Groups] - -#if 0 - const uint src_offset = expert_idx * n_num * group_num + group_idx * n_num + n_idx; - const uint dst_offset = expert_idx * n_num * group_num + n_idx * group_num + group_idx; - - scale_src += 
src_offset; - zp_src += src_offset / 2; - scale_dst += dst_offset; - zp_dst += dst_offset / 2; - - half2 src_value = as_half2(intel_sub_group_block_read_us2((const __global ushort *)(scale_src))); - intel_sub_group_block_write_us2((__global ushort *)(scale_dst), as_ushort2(src_value)); - - char zp_value = zp_src[0]; - zp_dst[0] = as_uchar(zp_value); -#else - // Calculate offsets for Scale (Source) - // src index for (e, g, n) - const uint src_idx_0 = expert_idx * group_num * n_num + group_idx * n_num + n_idx; - - // Read Scale - half s0 = scale_src[src_idx_0]; - half s1 = scale_src[src_idx_0 + 1]; - - // Calculate offsets for Scale (Dest) - // dst index for (e, n, g) - const uint dst_idx_0 = expert_idx * n_num * group_num + n_idx * group_num + group_idx; - // dst index for (e, n+1, g) - const uint dst_idx_1 = expert_idx * n_num * group_num + (n_idx + 1) * group_num + group_idx; - - // Write Scale - scale_dst[dst_idx_0] = s0; - scale_dst[dst_idx_1] = s1; - - // Handle ZP - // Only even groups process ZP to avoid race condition on byte writes - if (group_idx % 2 == 0) { - // We need zp(g, n), zp(g, n+1) -> from src(e, g, n/2) - // We need zp(g+1, n), zp(g+1, n+1) -> from src(e, g+1, n/2) - - // Src ZP index for (e, g, n/2) - // Note: zp_src is char*, so index is byte index. - // src_idx_0 is element index. ZP packed 2 elements per byte. - uint zp_src_idx_g0 = src_idx_0 / 2; - - // Src ZP index for (e, g+1, n/2) - // Offset difference between g and g+1 is n_num elements. 
- uint zp_src_idx_g1 = zp_src_idx_g0 + (n_num / 2); - - char byte_g0 = zp_src[zp_src_idx_g0]; - char byte_g1 = zp_src[zp_src_idx_g1]; - - // Unpack - // Assuming Little Endian packing: Low nibble = even index (n), High nibble = odd index (n+1) - uchar ubyte_g0 = as_uchar(byte_g0); - uchar ubyte_g1 = as_uchar(byte_g1); - - uchar zp_g0_n0 = ubyte_g0 & 0x0F; - uchar zp_g0_n1 = (ubyte_g0 >> 4) & 0x0F; - - uchar zp_g1_n0 = ubyte_g1 & 0x0F; - uchar zp_g1_n1 = (ubyte_g1 >> 4) & 0x0F; - - // Pack for Dest - // Dest packing is along Groups. - // Byte at (e, n, g/2) contains zp(e, n, g) [Low] and zp(e, n, g+1) [High] - - uchar dst_byte_n0 = zp_g0_n0 | (zp_g1_n0 << 4); - uchar dst_byte_n1 = zp_g0_n1 | (zp_g1_n1 << 4); - - // Dest ZP indices - // dst_idx_0 corresponds to (e, n, g). - // ZP array is packed along G. - // So index is dst_idx_0 / 2. - uint zp_dst_idx_n0 = dst_idx_0 / 2; - - // dst_idx_1 corresponds to (e, n+1, g). - uint zp_dst_idx_n1 = dst_idx_1 / 2; - - zp_dst[zp_dst_idx_n0] = as_char(dst_byte_n0); - zp_dst[zp_dst_idx_n1] = as_char(dst_byte_n1); - } -#endif -} #endif From 649c52f4e0efaa0d07f327c017d11685c566e13f Mon Sep 17 00:00:00 2001 From: River Date: Tue, 2 Dec 2025 12:39:49 +0800 Subject: [PATCH 11/20] remove some debug code --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index e20c2f790f0adb..a6cdad07a5651a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -10,9 +10,7 @@ #include "intel_gpu/graph/kernel_impl_params.hpp" #include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp" -// #include "intel_gpu/primitives/moe_gemm.hpp" #include "ocl_v2/utils/jitter.hpp" -// #include "moe_gemm_inst.h" #include 
"../utils/kernel_generator.hpp" // clang-format on @@ -63,10 +61,10 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx: " << m_wei_idx << std::endl; GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); - // u4:bfyx:4x3072x8x128:nopad + // weight layout: u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); - // std::cout << "\t expert_stride: " << expert_stride / 2 << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t expert_stride: " << expert_stride / 2 << std::endl; GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx: " << m_scale_idx << std::endl; GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx.get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; @@ -79,32 +77,26 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& } auto desc = params.typed_desc(); - switch(m_type) { - case MoE3GemmMicroKernelType::MLP_GATE: - case MoE3GemmMicroKernelType::MLP_UP: - // f16:bfyx:[?,2048]:nopad - jit.make("INPUT_STRIDE", desc->_config.hidden_size); - jit.make("OUTPUT_STRIDE", desc->_config.inter_size); - // std::cout << "\t INPUT_STRIDE: " << desc->_config.hidden_size << std::endl; - // std::cout << "\t OUTPUT_STRIDE: " << desc->_config.inter_size << std::endl; - break; - case MoE3GemmMicroKernelType::MLP_DOWN: - jit.make("INPUT_STRIDE", desc->_config.inter_size); - jit.make("OUTPUT_STRIDE", desc->_config.hidden_size); - // std::cout << "\t INPUT_STRIDE: " << desc->_config.inter_size << std::endl; - // std::cout << "\t OUTPUT_STRIDE: " << desc->_config.hidden_size << std::endl; - break; - default: - OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); - break; + switch 
(m_type) { + case MoE3GemmMicroKernelType::MLP_GATE: + case MoE3GemmMicroKernelType::MLP_UP: + // f16:bfyx:[?,2048]:nopad + jit.make("INPUT_STRIDE", desc->_config.hidden_size); + jit.make("OUTPUT_STRIDE", desc->_config.inter_size); + break; + case MoE3GemmMicroKernelType::MLP_DOWN: + jit.make("INPUT_STRIDE", desc->_config.inter_size); + jit.make("OUTPUT_STRIDE", desc->_config.hidden_size); + break; + default: + OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); + break; } auto slm_size = moe_gemm.getSetting("slm_size"); - // std::cout << "MoE3GemmMicroGenerator::get_jit_constants() slm_size: " << slm_size << std::endl; if (slm_size > 0) jit.make("USE_SLM", 1); - // std::cout << "MoE3GemmMicroGenerator::get_jit_constants() done " << std::endl; return jit; } @@ -163,6 +155,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, break; } + // weight layout example: u4:bfyx:4x3072x8x128:nopad const auto& weight_shape = params.get_input_layout(wei_idx).get_shape(); const bool is_prefill = true; size_t m = weight_shape[1]; @@ -186,7 +179,9 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, problem_moe.Ta_ext = convert_type(params.get_input_layout(wei_idx).data_type); problem_moe.A.setAlignment(micro::alignment_for_ld(k * problem_moe.Ta_ext)); - problem_moe.Ta_scale = convert_type(params.get_input_layout(scale_idx).data_type); // zp dt + // scale layout example: f16:bfyx:4x8x3072:nopad + const auto& scale_layout = params.get_input_layout(scale_idx); + problem_moe.Ta_scale = convert_type(scale_layout.data_type); problem_moe.A_scale.setAlignment(2); problem_moe.A_scale.layout = micro::MatrixLayout::N; problem_moe.asPtrDims = static_cast(MICRO_DIMENSIONALITY::MATRIX); @@ -197,6 +192,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, opts_moe.scaleA = true; const bool is_weight_symmetric_quantized = false; if (!is_weight_symmetric_quantized) { + // zp layout example: 
u4:bfyx:4x8x3072:nopad const auto& zp_layout = params.get_input_layout(zp_idx); const auto zp_dt = convert_type(zp_layout.data_type); problem_moe.Tao = zp_dt; From 1b6faa2fb1477200cc4cecd7b04ae455d3872e37 Mon Sep 17 00:00:00 2001 From: River Date: Tue, 2 Dec 2025 21:51:48 +0800 Subject: [PATCH 12/20] Fix minor issue --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 0e2fb435b7a417..9e1c1f17ccfc6b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -827,6 +827,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { _gate_up_group_size = static_cast(cur_moe->_config.group_size); _down_group_size = static_cast(cur_moe->_config.group_size); + if (cur_moe->_config.group_size == std::numeric_limits::max()) { + _gate_up_group_size = static_cast(cur_moe->_config.hidden_size); + _down_group_size = static_cast(cur_moe->_config.inter_size); + } GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt prefill: group_size=" << cur_moe->_config.group_size << ", gate_up_group_size=" << _gate_up_group_size << ", down_group_size=" << _down_group_size << std::endl; } From 9c8f21b9bab0fd703578e39e09896ee1b7c1f819 Mon Sep 17 00:00:00 2001 From: River Date: Wed, 3 Dec 2025 09:02:19 +0800 Subject: [PATCH 13/20] Fixed accuracy issue when group_size=128 --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 1 + .../intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index a6cdad07a5651a..049ad770b5575f 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -57,6 +57,7 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("IS_GENERATE", 0); // prefill jit.make("INPUT_SEQ_LEN", 4); // prefill not use it jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); + jit.make("SCALE_ZP_NO_TRANSPOSE", 1); GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx: " << m_wei_idx << std::endl; GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl index 8870cba94c6ac4..538dcbd41c855c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_gemm.cl @@ -84,7 +84,13 @@ KERNEL(moe_gemm)(OPTIONAL_SHAPE_INFO_ARG uint sg_i0 = wg_i0 + sg_i * ugemm_moe_sg_tile_m; uint sg_j0 = wg_j0 + sg_j * ugemm_moe_sg_tile_n; #ifdef WEIGHT_COMPRESSED_INT4 - uint num_groups = NUM_GROUPS; +#ifdef SCALE_ZP_NO_TRANSPOSE + /* This parameter is the leading dimension for scales/zp. Since scales/zp are non-transpose, + the leading dimension is the stride between successive groups in the k dimension. 
*/ + uint scale_zp_leading_dim = m; +#else + uint scale_zp_leading_dim = NUM_GROUPS; +#endif #endif if (wg_j0 >= cur_n_tokens) return; /* early exit if outside batch */ @@ -98,7 +104,7 @@ KERNEL(moe_gemm)(OPTIONAL_SHAPE_INFO_ARG #ifdef WEIGHT_ZP_DT , weight_zps #endif - , num_groups + , scale_zp_leading_dim #endif ); ugemm_moe_c_type_half c_tile_half; From 3ec888647bf7160f0faef0499c438ef7ce873651 Mon Sep 17 00:00:00 2001 From: River Date: Wed, 3 Dec 2025 11:33:11 +0800 Subject: [PATCH 14/20] refactor code --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 62 ++- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp | 3 +- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 414 +++++++++--------- 3 files changed, 248 insertions(+), 231 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 049ad770b5575f..e6951764c56f80 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -29,14 +29,19 @@ static size_t get_subgroup_size(gpu_arch arch) { case gpu_arch::xe3: return 16; default: - return 0; + // fallback to safe value + return 8; } } JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& params, const micro::Package& moe_gemm, const moe_3gemm_config& cfg) const { const auto& device_info = params.get_device_info(); + const size_t subgroup_size = get_subgroup_size(device_info.arch); + const auto& weight_layout = params.input_layouts[m_wei_idx]; + const auto& scale_layout = params.input_layouts[m_scale_idx]; + const auto& zp_layout = params.input_layouts[m_zp_idx]; - // auto jit = make_base_jit_constants(params); + // Internal generator of JIT constants, require intermediate buffers and part of primitive's inputs. 
JitConstants jit; auto entry_point = get_entry_point(params); jit.add(make_jit_constant("KERNEL(name)", "__kernel void " + entry_point)); @@ -44,34 +49,47 @@ JitConstants MoE3GemmMicroGenerator::get_jit_constants(const kernel_impl_params& jit.make("OPTIONAL_SHAPE_INFO_ARG", ""); jit.make("OPTIONAL_SHAPE_INFO_TENSOR", ""); - jit.make("SUBGROUP_SIZE", get_subgroup_size(device_info.arch)); - jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // output - jit.make("INPUT0_TYPE", to_ocl_type(data_types::f16)); // input: f16 - jit.make("INPUT1_TYPE", to_ocl_type(data_types::u8)); // weight: u4 - jit.make("INPUT2_TYPE", to_ocl_type(data_types::i32)); // experts_ids: i32 - jit.make("INPUT3_TYPE", to_ocl_type(data_types::i32)); // input_offset_per_expert: i32 - jit.make("INPUT4_TYPE", to_ocl_type(data_types::i32)); // n_array: i32 - jit.make("WEIGHT_SCALE_DT", to_ocl_type(data_types::f16)); // scale: f16 - jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u8)); // zp: u4 - jit.make("WEIGHT_COMPRESSED_INT4", 1); - jit.make("IS_GENERATE", 0); // prefill + jit.make("SUBGROUP_SIZE", subgroup_size); + jit.make("OUTPUT_TYPE", to_ocl_type(data_types::f16)); // output + jit.make("INPUT0_TYPE", to_ocl_type(data_types::f16)); // input: f16 + if (weight_layout.data_type == ov::element::u4 || weight_layout.data_type == ov::element::i4) { + jit.make("INPUT1_TYPE", to_ocl_type(data_types::u8)); // weight: u4/i4 + jit.make("WEIGHT_COMPRESSED_INT4", 1); + } else { + jit.make("INPUT1_TYPE", to_ocl_type(weight_layout.data_type)); // weight type + jit.make("WEIGHT_COMPRESSED_INT4", 0); + } + jit.make("INPUT2_TYPE", to_ocl_type(data_types::i32)); // experts_ids: i32 + jit.make("INPUT3_TYPE", to_ocl_type(data_types::i32)); // input_offset_per_expert: i32 + jit.make("INPUT4_TYPE", to_ocl_type(data_types::i32)); // n_array: i32 + jit.make("WEIGHT_SCALE_DT", to_ocl_type(scale_layout.data_type)); // scale: f16 + + if (zp_layout.data_type == ov::element::u4 || zp_layout.data_type == 
ov::element::i4) { + jit.make("WEIGHT_ZP_DT", to_ocl_type(data_types::u8)); // zp: u4/i4 + jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); + } else { + jit.make("WEIGHT_ZP_DT", to_ocl_type(zp_layout.data_type)); // zp type + jit.make("WEIGHT_COMPRESSED_ZP_INT4", 0); + } + + jit.make("IS_GENERATE", 0); // only for prefill jit.make("INPUT_SEQ_LEN", 4); // prefill not use it - jit.make("WEIGHT_COMPRESSED_ZP_INT4", 1); jit.make("SCALE_ZP_NO_TRANSPOSE", 1); GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx: " << m_wei_idx << std::endl; - GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx.get_shape(): " << params.input_layouts[m_wei_idx].to_short_string() << std::endl; - const auto& weight_shape = params.input_layouts[m_wei_idx].get_shape(); + GPU_DEBUG_TRACE_DETAIL << "\t m_wei_idx.get_shape(): " << weight_layout.to_short_string() << std::endl; + const auto& weight_shape = weight_layout.get_shape(); // weight layout: u4:bfyx:4x3072x8x128:nopad size_t expert_stride = weight_shape.size() == 4 ? (weight_shape[1] * weight_shape[2] * weight_shape[3]) : (weight_shape[1] * weight_shape[2]); jit.make("EXPERT_STRIDE", expert_stride / 2); GPU_DEBUG_TRACE_DETAIL << "\t expert_stride: " << expert_stride / 2 << std::endl; GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx: " << m_scale_idx << std::endl; - GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx.get_shape(): " << params.input_layouts[m_scale_idx].to_short_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "\t m_scale_idx.get_shape(): " << scale_layout.to_short_string() << std::endl; if (cfg.weight_group_size > 0) { - jit.make("NUM_GROUPS", params.input_layouts[m_scale_idx].get_shape()[1]); - GPU_DEBUG_TRACE_DETAIL << "\t NUM_GROUPS: " << params.input_layouts[m_scale_idx].get_shape()[1] << std::endl; + const auto scale_shape = scale_layout.get_shape(); + jit.make("NUM_GROUPS", scale_shape[1]); + GPU_DEBUG_TRACE_DETAIL << "\t NUM_GROUPS: " << scale_shape[1] << std::endl; } else { jit.make("NUM_GROUPS", 1); GPU_DEBUG_TRACE_DETAIL << "\t NUM_GROUPS: 1" << std::endl; @@ 
-267,10 +285,11 @@ DispatchDataFunc MoE3GemmMicroGenerator::get_dispatch_data_func() const { GPU_DEBUG_TRACE_DETAIL << "\t n = " << n << std::endl; const auto& experts_weight_shape = experts_weight_layout.get_shape(); + const size_t subgroup_size = get_subgroup_size(device_info.arch); size_t m = experts_weight_shape[1]; size_t k = experts_weight_shape.size() == 4 ? experts_weight_shape[2] * experts_weight_shape[3] : experts_weight_shape[2]; - wgs.local = {sg_per_wg_m * get_subgroup_size(device_info.arch), sg_per_wg_n, 1}; - wgs.global = {align_to(ceil_div(m, sg_tile_m), sg_per_wg_m) * get_subgroup_size(device_info.arch), + wgs.local = {sg_per_wg_m * subgroup_size, sg_per_wg_n, 1}; + wgs.global = {align_to(ceil_div(m, sg_tile_m), sg_per_wg_m) * subgroup_size, align_to(ceil_div(n, sg_tile_n), sg_per_wg_n), static_cast(rtp->num_actually_used_experts)}; ScalarDescriptor s_m{ScalarDescriptor::Types::INT32}; @@ -371,7 +390,6 @@ KernelData MoE3GemmMicroGenerator::get_kernel_data(const kernel_impl_params& par GPU_DEBUG_TRACE_DETAIL << "\t kd.code->entry_point: " << kd.code->entry_point << std::endl; kd.code->str = build_code(get_kernel_name(), jit, kd.code->entry_point); } catch (const std::runtime_error& ex) { - GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::get_kernel_data() - can't build code: " << ex.what() << std::endl; OPENVINO_THROW("MoE3GemmMicroGenerator::get_kernel_data() - can't build code: ", ex.what()); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp index ada213c48fc559..2a5ec5ed7832bd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp @@ -9,10 +9,9 @@ #include "../utils/kernel_generator.hpp" #include "common_utils/jitter.hpp" #include "intel_gpu/graph/kernel_impl_params.hpp" -#include "moe_3gemm_base.hpp" -// #include 
"intel_gpu/primitives/moe_gemm.hpp" #include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp" #include "micro_utils.hpp" +#include "moe_3gemm_base.hpp" #include "moe_gemm_gen_opt.hpp" #include "moe_gemm_inst.h" #include "ocl_v2/utils/jitter.hpp" diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 9e1c1f17ccfc6b..d394af6188da14 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -8,7 +8,7 @@ // clang-format on #define DUMP_TENSOR_CONTENTS 0 -#define DEBUG_MOE_LOG 0 +#define DEBUG_MOE_LOG 0 #ifdef ENABLE_ONEDNN_FOR_GPU # include @@ -365,8 +365,7 @@ class MoE3GemmSwigluSoftMaxTopK : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -395,8 +394,7 @@ class MoE3GemmSwigluGather : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -470,7 +468,7 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { jit.make("INPUT0_TYPE", "half"); jit.make("INPUT1_TYPE", "int"); jit.make("OUTPUT_TYPE", "half"); - jit.make("OPTIONAL_SHAPE_INFO_ARG",""); + jit.make("OPTIONAL_SHAPE_INFO_ARG", ""); // std::cout << "MoE3GemmSwigluPrefillGather::get_jit_constants(): hidden_size: " << hidden_size << ", block_size: " << block_size // << ", local_threads_count: " << local_threads_count << ", 
batches_per_thread: " << batches_per_thread @@ -486,8 +484,7 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -516,8 +513,7 @@ class MoE3GemmSwigluPrefillSwiglu : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -536,21 +532,21 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { auto block_size = 4; auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(params), block_size, hidden_size); - jit.make("OPTIONAL_SHAPE_INFO_ARG",""); + jit.make("OPTIONAL_SHAPE_INFO_ARG", ""); jit.make("ACTIVE_EXPERTS", desc->_config.top_k); jit.make("HIDDEN_SIZE", hidden_size); jit.make("VEC_BLK_SIZE", 4); jit.make("BATCHES_PER_THREAD", batches_per_thread); jit.make("SET_ACTUAL_USED_EXPERTS_NUM", 1); - jit.make("INPUT0_TYPE", "half"); - jit.make("INPUT1_TYPE", "int"); - jit.make("INPUT2_TYPE", "half"); - jit.make("INPUT3_TYPE", "int"); - jit.make("INPUT4_TYPE", "int"); - jit.make("INPUT5_TYPE", "int"); - jit.make("INPUT6_TYPE", "int"); - jit.make("OUTPUT_TYPE", "half"); + jit.make("INPUT0_TYPE", "half"); // mlp_down output + jit.make("INPUT1_TYPE", "int"); // expert indices per token + jit.make("INPUT2_TYPE", "half"); // experts router weights + jit.make("INPUT3_TYPE", "int"); // tokens per expert + jit.make("INPUT4_TYPE", "int"); // expert start offsets + jit.make("INPUT5_TYPE", "int"); // tokens len for experts + jit.make("INPUT6_TYPE", 
"int"); // expert id + jit.make("OUTPUT_TYPE", "half"); // output return jit; } @@ -562,8 +558,7 @@ class MoE3GemmSwigluPrefillScatterReduce : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -588,8 +583,7 @@ class MoE3GemmSwigluScatter : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -642,8 +636,7 @@ class MoE3GemmSwigluMLPGateUp : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -666,8 +659,7 @@ class MoE3GemmSwigluMLPDown : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -690,8 +682,7 @@ class MoE3GemmSwigluMLPReduce : public KernelGenerator { } [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override { - return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) { - }}; + return DispatchDataFunc{[](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {}}; } }; @@ -713,7 +704,7 @@ 
class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { Stage::Ptr prefill_gather = make_stage(); Stage::Ptr micro_gemm_gate = make_stage(MoE3GemmMicroKernelType::MLP_GATE); - Stage::Ptr micro_gemm_up = make_stage(MoE3GemmMicroKernelType::MLP_UP); + Stage::Ptr micro_gemm_up = make_stage(MoE3GemmMicroKernelType::MLP_UP); Stage::Ptr micro_gemm_down = make_stage(MoE3GemmMicroKernelType::MLP_DOWN); Stage::Ptr prefill_swiglu = make_stage(); Stage::Ptr prefill_scatter_reduce = make_stage(); @@ -785,13 +776,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // auto rtp = static_cast(m_rt_params.get()); init(node.as().get_primitive()); - add_stage(softmax_topk, params); - add_stage(gather, params); - add_stage(scatter, params); - add_stage(mlp_gate_up, params); - add_stage(mlp_down, params); - add_stage(mlp_reduce, params); - auto use_micro_gemm_prefill_str = std::getenv("MOE_USE_MICRO_GEMM_PREFILL"); if (use_micro_gemm_prefill_str) { GPU_DEBUG_TRACE_DETAIL << "MOE_USE_MICRO_GEMM_PREFILL = " << use_micro_gemm_prefill_str << std::endl; @@ -811,6 +795,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { << ", arch=" << static_cast(info.arch) << std::endl; } + add_stage(softmax_topk, params); + add_stage(mlp_gate_up, params); + add_stage(mlp_down, params); + add_stage(mlp_reduce, params); if (use_micro_gemm_prefill) { add_stage(prefill_gather, params); add_stage(micro_gemm_gate, params); @@ -818,6 +806,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(prefill_swiglu, params); add_stage(micro_gemm_down, params); add_stage(prefill_scatter_reduce, params); + } else { + add_stage(gather, params); + add_stage(scatter, params); } } @@ -929,10 +920,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(index_layout, true); // 7: batch internal_buffers.emplace_back(index_layout, true); // 8: topk - GPU_DEBUG_TRACE_DETAIL << "[DEBUG] get_internal_buffer_descs(): use_micro_gemm_prefill=" << 
use_micro_gemm_prefill - << std::endl; + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] get_internal_buffer_descs(): use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; // for micro_gemm - if(use_micro_gemm_prefill && batch > 1) { + if (use_micro_gemm_prefill && batch > 1) { layout layout_micro_gemm(ov::Shape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); internal_buffers.emplace_back(layout_micro_gemm, true); // 9: experts_ids for each activated expert internal_buffers.emplace_back(layout_micro_gemm, true); // 10: token start offset idx (input gather tokens) for each activated expert @@ -1178,31 +1168,31 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return ret; } -#if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS void print_mem_f16(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 50) { auto layout = mem->get_layout().get_shape(); size_t row = 0; size_t col = 0; - switch(layout.size()) { - case 1: - row = 1; - col = layout[0]; - break; - case 2: - row = layout[0]; - col = layout[1]; - break; - case 3: - row = layout[0] * layout[1]; - col = layout[2]; - break; - case 4: - row = layout[0] * layout[1] * layout[2]; - col = layout[3]; - break; - default: - OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); + switch (layout.size()) { + case 1: + row = 1; + col = layout[0]; + break; + case 2: + row = layout[0]; + col = layout[1]; + break; + case 3: + row = layout[0] * layout[1]; + col = layout[2]; + break; + case 4: + row = layout[0] * layout[1] * layout[2]; + col = layout[3]; + break; + default: + OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); } cldnn::mem_lock lock_data{mem, stream}; @@ -1223,28 +1213,28 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { size_t row = 0; size_t col = 0; - switch(layout.size()) { - case 1: - row = 1; - col = layout[0]; - break; - case 2: - row = layout[0]; - col = layout[1]; - break; - case 3: - row = layout[0] 
* layout[1]; - col = layout[2]; - break; - case 4: - row = layout[0] * layout[1] * layout[2]; - col = layout[3]; - break; - default: - OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); + switch (layout.size()) { + case 1: + row = 1; + col = layout[0]; + break; + case 2: + row = layout[0]; + col = layout[1]; + break; + case 3: + row = layout[0] * layout[1]; + col = layout[2]; + break; + case 4: + row = layout[0] * layout[1] * layout[2]; + col = layout[3]; + break; + default: + OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); } - col = col / 2; //u4 + col = col / 2; // u4 cldnn::mem_lock lock_data{mem, stream}; std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; for (size_t j = 0; j < row && j < max_row; j++) { @@ -1278,12 +1268,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } std::cout << std::endl; }; -#endif +# endif - cldnn::event::ptr exec_prefill_opt(const std::vector& events, - typed_primitive_inst& instance, - scratch_buffers& scratch, - expert_mask_cpu& expert_mask_cpu) { + cldnn::event::ptr exec_prefill_micro_gemm(const std::vector& events, + typed_primitive_inst& instance, + scratch_buffers& scratch, + expert_mask_cpu& expert_mask_cpu) { auto cur_moe = instance.get_typed_desc(); int max_topk = static_cast(cur_moe->_config.top_k); @@ -1334,8 +1324,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } rtp->num_actually_used_experts = num_actually_used_experts; - // debug print - #if DEBUG_MOE_LOG +// debug print +# if DEBUG_MOE_LOG { std::cout << "\nstep 1: prefill_mask num_actually_used_experts=" << num_actually_used_experts << std::endl; std::cout << "expert_id[" << num_actually_used_experts << "]: = "; @@ -1365,7 +1355,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } std::cout << std::endl; } - #endif +# endif } // step 2: generate gather input tokens @@ -1381,11 +1371,11 @@ class moe_3gemm_swiglu_opt_impl : public 
PrimitiveImplOCL { calc_thread_count(const_cast(*instance.get_impl_params()), block_size, hidden_size); auto token_per_expert = intermediates_memories[12]->get_layout().get_shape()[0]; - #if DEBUG_MOE_LOG +# if DEBUG_MOE_LOG std::cout << "\nstep 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread << ", unaligned_elements=" << unaligned_elements << ", token_per_expert=" << token_per_expert << ", block_size = " << block_size << std::endl; - #endif +# endif ret_event = execute_stage(events, instance, *prefill_gather, @@ -1394,15 +1384,15 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(token_per_expert * local_threads_count), 1, 1}, {static_cast(local_threads_count), 1, 1}); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { - stream.finish(); //debug + stream.finish(); // debug // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); // print_mem(stream, intermediates_memories[12], "token idx per expert"); // print_mem_f16(stream, scratch.x, "gathered token"); std::cout << std::endl; } - #endif +# endif } // step 3: moe_gemm for up and gate @@ -1419,34 +1409,34 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // output: // 0: up/gate output, shape = [token_len * expert_topK, hidden_size] { - #if DEBUG_MOE_LOG +# if DEBUG_MOE_LOG std::cout << "\nstep 3: moe_gemm for up and gate" << std::endl; - #endif +# endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, intermediates_memories[4], "up_token_input"); print_mem(stream, intermediates_memories[9], "up_expert_id", num_actually_used_experts); print_mem(stream, intermediates_memories[10], "up_input_offset_per_expert", num_actually_used_experts); print_mem(stream, intermediates_memories[11], 
"up_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[2], "up_output"); } - #endif +# endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { - stream.finish(); //debug + stream.finish(); // debug // print_mem_f16(stream, intermediates_memories[4], "gate_token_input"); // print_mem(stream, intermediates_memories[9], "gate_expert_id", num_actually_used_experts); // print_mem(stream, intermediates_memories[10], "gate_input_offset_per_expert", num_actually_used_experts); // print_mem(stream, intermediates_memories[11], "gate_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[6], "gate_output"); } - #endif +# endif } // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmod(x)=x*(1+exp(-x)) @@ -1459,9 +1449,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); auto token_size = input_shape[0] * max_topk; - #if DEBUG_MOE_LOG +# if DEBUG_MOE_LOG std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _intermediate_size << std::endl; - #endif +# endif ret_event = execute_stage({ret_event}, instance, @@ -1470,15 +1460,15 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {intermediates_memories[6]}, {static_cast(token_size), static_cast(_intermediate_size), 1}, {1, subgroup_size, 1}); - - #if DUMP_TENSOR_CONTENTS + +# if DUMP_TENSOR_CONTENTS { - ret_event->wait(); //debug - stream.finish(); //debug + ret_event->wait(); // debug + stream.finish(); // debug print_mem_f16(stream, intermediates_memories[2], "silu_up_input"); print_mem_f16(stream, intermediates_memories[6], "silu_gate_up_output"); } - #endif +# endif } // step 5: moe_gemm for down @@ -1495,21 +1485,21 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // output: 
// 0: down output, shape = [token_len * expert_topK, hidden_size] { - #if DEBUG_MOE_LOG +# if DEBUG_MOE_LOG std::cout << "\nstep 5: moe_gemm for down" << std::endl; - #endif +# endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_down); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, intermediates_memories[6], "down_token_input"); // print_mem(stream, intermediates_memories[9], "down_expert_id", num_actually_used_experts); // print_mem(stream, intermediates_memories[10], "down_input_offset_per_expert", num_actually_used_experts); // print_mem(stream, intermediates_memories[11], "down_token_len", num_actually_used_experts); print_mem_f16(stream, intermediates_memories[3], "down_output"); } - #endif +# endif } // step 6: scatter and reduce @@ -1525,13 +1515,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 0: final hidden states, shape = [token_len, hidden_size] { auto input_shape = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES))->get_layout().get_shape(); - auto token_size = input_shape[0];// * max_topk; + auto token_size = input_shape[0]; // * max_topk; auto [local_threads_count, batches_per_thread, _] = calc_thread_count(const_cast(*instance.get_impl_params()), 4, _hidden_size); - #if DEBUG_MOE_LOG +# if DEBUG_MOE_LOG std::cout << "\nstep 6: prefill_scatter_reduce token_size=" << token_size << ", local_threads_count=" << local_threads_count << ", num_actually_used_experts = " << num_actually_used_experts << std::endl; - #endif +# endif ret_event = execute_stage({ret_event}, instance, @@ -1550,9 +1540,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {num_actually_used_experts}); // TODO: remove this sync which maybe lead to output is incorrect stream.finish(); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, 
intermediates_memories[3], "scatter_reduce_input"); print_mem(stream, batch_mem_ptr, "scatter_reduce_experts_per_token"); print_mem_f16(stream, routing_mem_ptr, "scatter_reduce_expert_weights"); @@ -1562,7 +1552,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { print_mem(stream, intermediates_memories[9], "scatter_reduce_expert_id", num_actually_used_experts); print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); } - #endif +# endif } return ret_event; @@ -1656,83 +1646,23 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return *_kernels.get(key); } - // inputs 0 is hidden_states, inputs 1 is router_logits[num_tokens, NUM_EXPERTS=128] - // extra step Softmax_TopK is fused to give topk-id & router_weights - // - // scratch.topk_id, scratch.full_router_weights = Softmax_TopK(router_logits) - // - // generate expert_mask from topk-id - // expert_mask.batch[i][j] : j'th token index for i'th expert - // expert_mask.topk[i][j] : topk-output offset for j'th token for i'th expert, used to get weights - // expert_mask.pred_flag[i]: bool, if expert i can be skipped - // - // - // scratch.x, scratch.routing_weights = gather(hidden_states, scratch.full_router_weights, expert_mask.batch, expert_mask.topk) - // scratch.y = MLP(scratch.x, .gate/up/down) * scratch.routing_weights - // scatter(final_hidden, scratch.y, expert_mask.batch) - // - cldnn::event::ptr execute(const std::vector& events, cldnn::primitive_inst& ins) override { - OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("moe_3gemm_swiglu_opt_impl::execute")); - auto& instance = reinterpret_cast&>(ins); + cldnn::event::ptr exec_prefill_onednn(const std::vector& events, + cldnn::stream& stream, + typed_primitive_inst& instance, + scratch_buffers& scratch, + expert_mask_cpu& expert_mask) { auto cur_moe = instance.get_typed_desc(); const auto& config = cur_moe->_config; - int max_topk = static_cast(config.top_k); - auto& cur_net = 
instance.get_network(); - auto& stream = cur_net.get_stream(); + auto& dnn_stream = stream.get_onednn_stream(); + cldnn::event::ptr result_event = nullptr; auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); - auto batch = static_cast(hidden_states_layout.get_shape()[0]); - - scratch_buffers scratch; - prepare_internal_buffers(instance, scratch, batch); - - // softmax+topk - auto lws_size = cur_moe->_config.num_expert; - auto topk_event = execute_stage(events, - instance, - *softmax_topk, - {instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ROUTING_WEIGHTS))}, - {scratch.topk_id, scratch.topk_weights}, - {static_cast(batch), lws_size}, - {1, lws_size}); - - // Single batch is a special case, we don't need to do gather/scatter, - // and we can apply optimal kernels against memory bound to improve performance. - // It is very important for MoE's second token performance. - if (batch == 1) { - return exec_single_batch({topk_event}, instance, scratch); - } - auto& engine = instance.get_network().get_engine(); init_dnnl_weights(cur_moe, engine, scratch.moe_fusion_wei_addr); - auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); - auto final_hidden_states_layout = instance.get_output_layout(0); - - // onednn path will accumulate to the output - final_hidden_states_mem_ptr->fill(stream, false); - - // Wait for topk is ready - topk_event->wait(); - // [batch, max_topk] - auto topk_id_mem = scratch.topk_id; - - expert_mask_cpu expert_mask; - get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); - - GPU_DEBUG_TRACE_DETAIL << "\nMoE3GemmFusedCompressed exec(): batch=" << batch << ", max_topk=" << max_topk - << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; - if (use_micro_gemm_prefill) { - GPU_DEBUG_TRACE_DETAIL << "\nUse micro_gemm prefill path" << std::endl; - update_rt_params(instance); - return exec_prefill_opt({topk_event}, instance, scratch, 
expert_mask); - } else { - GPU_DEBUG_TRACE_DETAIL << "\nUse onednn path" << std::endl; - } - - auto& dnn_stream = stream.get_onednn_stream(); - cldnn::event::ptr result_event = nullptr; + // auto final_hidden_states_layout = instance.get_output_layout(0); auto routing_mem_ptr = scratch.topk_weights; + auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); auto get_best_lws = [](size_t hidden_size) { const size_t candidate[] = {128, 64, 32, 16, 8}; for (size_t i = 0; i < sizeof(candidate) / sizeof(size_t); i++) { @@ -1742,11 +1672,9 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } OPENVINO_THROW("hidden_size=", hidden_size, " is not divisible by any of ", sizeof(candidate) / sizeof(size_t), " candidates"); }; - lws_size = get_best_lws(_hidden_size); + auto lws_size = get_best_lws(_hidden_size); + int max_topk = static_cast(config.top_k); - if (batch <= 1) { - OPENVINO_THROW("batch size should be > 1 for this path!"); - } for (size_t expert_no = 0; expert_no < config.num_expert; expert_no++) { if (expert_no >= expert_mask.pred_flag.size()) { OPENVINO_THROW("expert_no=", expert_no, " is out of bounds"); @@ -1774,17 +1702,17 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {1, lws_size}, instance.needs_completion_event()); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { // debug print std::cout << "expert_no=" << expert_no << ", n_token=" << n_token << ", hidden_size=" << _hidden_size << ", intermediate_size=" << _intermediate_size << std::endl; - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, hidden_states_mem_ptr, "input_token"); print_mem_f16(stream, scratch.x, "gathered_token", n_token); print_mem_f16(stream, scratch.routing_weights, "routing_weights"); } - #endif +# endif // up kernel.up.forward(dnn_stream, @@ -1793,13 +1721,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), 
dnnl::memory()); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { // debug print - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, scratch.up, "up_output", n_token); } - #endif +# endif // gate kernel.gate.forward(dnn_stream, @@ -1808,13 +1736,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab)); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { // debug print - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, scratch.gate, "gate_up_output", n_token); } - #endif +# endif // down kernel.down.forward(dnn_stream, @@ -1823,13 +1751,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.routing_weights, {static_cast(n_token * max_topk)}, dnnl::memory::format_tag::a)); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { // debug print - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, scratch.y, "down_with_weights_output", n_token); } - #endif +# endif // index_add result_event = execute_stage({result_event}, @@ -1840,17 +1768,89 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(n_token), static_cast(_hidden_size)}, {1, lws_size}, instance.needs_completion_event()); - #if DUMP_TENSOR_CONTENTS +# if DUMP_TENSOR_CONTENTS { // debug print - stream.finish(); //debug + stream.finish(); // debug print_mem_f16(stream, final_hidden_states_mem_ptr, "final_output"); } - #endif +# endif } return result_event; } + + // inputs 0 is hidden_states, inputs 1 is router_logits[num_tokens, NUM_EXPERTS=128] + // extra step Softmax_TopK is fused to give topk-id & router_weights + // + // scratch.topk_id, scratch.full_router_weights = 
Softmax_TopK(router_logits) + // + // generate expert_mask from topk-id + // expert_mask.batch[i][j] : j'th token index for i'th expert + // expert_mask.topk[i][j] : topk-output offset for j'th token for i'th expert, used to get weights + // expert_mask.pred_flag[i]: bool, if expert i can be skipped + // + // + // scratch.x, scratch.routing_weights = gather(hidden_states, scratch.full_router_weights, expert_mask.batch, expert_mask.topk) + // scratch.y = MLP(scratch.x, .gate/up/down) * scratch.routing_weights + // scatter(final_hidden, scratch.y, expert_mask.batch) + // + cldnn::event::ptr execute(const std::vector& events, cldnn::primitive_inst& ins) override { + OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("moe_3gemm_swiglu_opt_impl::execute")); + auto& instance = reinterpret_cast&>(ins); + auto cur_moe = instance.get_typed_desc(); + const auto& config = cur_moe->_config; + auto& cur_net = instance.get_network(); + auto& stream = cur_net.get_stream(); + + auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); + auto batch = static_cast(hidden_states_layout.get_shape()[0]); + + scratch_buffers scratch; + prepare_internal_buffers(instance, scratch, batch); + + // softmax+topk + auto lws_size = config.num_expert; + auto topk_event = execute_stage(events, + instance, + *softmax_topk, + {instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ROUTING_WEIGHTS))}, + {scratch.topk_id, scratch.topk_weights}, + {static_cast(batch), lws_size}, + {1, lws_size}); + + // Single batch is a special case, we don't need to do gather/scatter, + // and we can apply optimal kernels against memory bound to improve performance. + // It is very important for MoE's second token performance. 
+ if (batch == 1) { + return exec_single_batch({topk_event}, instance, scratch); + } + + // onednn path will accumulate to the output + if (!use_micro_gemm_prefill) { + auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); + final_hidden_states_mem_ptr->fill(stream, false); + } + + // Wait for topk is ready + topk_event->wait(); + // [batch, max_topk] + auto topk_id_mem = scratch.topk_id; + + expert_mask_cpu expert_mask; + get_expert_mask_from_gpu(config, topk_id_mem, stream, expert_mask); + + GPU_DEBUG_TRACE_DETAIL << "\nMoE3GemmFusedCompressed exec(): batch=" << batch << ", max_topk=" << static_cast(config.top_k) + << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; + if (use_micro_gemm_prefill) { + GPU_DEBUG_TRACE_DETAIL << "\nUse micro_gemm prefill path" << std::endl; + update_rt_params(instance); + return exec_prefill_micro_gemm({topk_event}, instance, scratch, expert_mask); + } + + GPU_DEBUG_TRACE_DETAIL << "\nUse onednn path" << std::endl; + return exec_prefill_onednn({topk_event}, stream, instance, scratch, expert_mask); + } }; } // namespace From b9162a5c454e56b51bdc7793b9d00c6854445640 Mon Sep 17 00:00:00 2001 From: River Date: Wed, 3 Dec 2025 14:58:06 +0800 Subject: [PATCH 15/20] Update internal buffer --- .../graph/impls/ocl_v2/moe/moe_3gemm_base.hpp | 16 +- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 54 +++---- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 147 ++++++++++-------- 3 files changed, 124 insertions(+), 93 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp index c7a2d26b6f2924..836b66c851a2e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp @@ -13,9 +13,19 @@ namespace ov::intel_gpu::ocl { -// mlp_gate: 0 -// mlp_up: 1 -// mlp_down: 2 +#define MOE_INTERNAL_BUFFER_TOPK_IDX 0 // topk_idx 
+#define MOE_INTERNAL_BUFFER_TOPK_WEIGHTS 1 // topk_weights +#define MOE_INTERNAL_BUFFER_UP_OUTPUT 2 // up output +#define MOE_INTERNAL_BUFFER_DOWN_OUTPUT 3 // down output +#define MOE_INTERNAL_BUFFER_GATE_UP_INPUT 4 // gather input tensor +#define MOE_INTERNAL_BUFFER_ROUTING_WEIGHTS 5 // routing_weights +#define MOE_INTERNAL_BUFFER_GATE_OUTPUT 6 // gate output +#define MOE_INTERNAL_BUFFER_EXPERT_MASK_BATCH 7 // expert_mask_batch +#define MOE_INTERNAL_BUFFER_EXPERT_MASK_TOPK 8 // expert_mask_topk +#define MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS 9 // experts_ids for each activated expert +#define MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT 10 // token start offset idx (input gather tokens) for each activated expert +#define MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT 11 // token len (input gather tokens) for each activated expert +#define MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT 12 // token idx per expert enum class MoE3GemmMicroKernelType : uint8_t { MLP_GATE = 0, MLP_UP = 1, MLP_DOWN = 2 }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index e6951764c56f80..58740aa0691cc3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -321,40 +321,40 @@ Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& p switch (m_type) { case MoE3GemmMicroKernelType::MLP_GATE: - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_GATE_UP_INPUT}); // gather input tensor args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_0)}); - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // gate output - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 
9}); // experts_ids - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len - args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m - args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_GATE_OUTPUT}); // gate output + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS}); // experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT}); // n_array - token len + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_0)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_0)}); // zp break; case MoE3GemmMicroKernelType::MLP_UP: - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4}); // gather input tensor + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_GATE_UP_INPUT}); // gather input tensor args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_1)}); - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); // up output - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - 
args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len - args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m - args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_UP_OUTPUT}); // up output + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS}); // experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT}); // n_array - token len + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_1)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_1)}); // zp break; case MoE3GemmMicroKernelType::MLP_DOWN: - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 6}); // intermediate_mem[6] + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_GATE_OUTPUT}); // intermediate_mem[6] args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::WEIGHT_2)}); - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3}); // down output - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 9}); // experts_ids - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 10}); // input_offset_per_expert - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 11}); // n_array - token len - 
args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m - args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale - args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_DOWN_OUTPUT}); // down output + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS}); // experts_ids + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT}); // input_offset_per_expert + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT}); // n_array - token len + args.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // m + args.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // k + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::SCALE_2)}); // scale + args.push_back({ArgumentDescriptor::Types::INPUT, static_cast(MOE3GemmInputIndex::ZP_2)}); // zp break; default: OPENVINO_THROW("Unsupported MoE3GemmMicroKernelType"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index d394af6188da14..d7079131ee65ac 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -258,12 +258,10 @@ std::shared_ptr make_cacheable(dnnl::engine eng, CArgs... 
cargs) { sptr = wptr.lock(); if (!sptr) { sptr = std::make_shared(eng, cargs...); - // ECOUT("make_cacheable re-constructed: ", typeid(T).name(), "(", cargs..., ")"); wptr = sptr; } } else { sptr = std::make_shared(eng, cargs...); - // ECOUT("make_cacheable constructed: ", typeid(T).name(), "(", cargs..., ")"); cache.emplace(std::make_pair(key, std::weak_ptr(sptr))); } return sptr; @@ -398,7 +396,7 @@ class MoE3GemmSwigluGather : public KernelGenerator { } }; -static size_t GetBlockSize(const RuntimeParams& params) { +static size_t get_vec_size(const RuntimeParams& params) { const auto& input = params.get_input_layout(0); size_t vec_size = 1; switch (input.data_type) { @@ -457,7 +455,7 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { // const auto& info = engine.get_device_info(); auto hidden_size = desc->_config.hidden_size; - auto block_size = GetBlockSize(params); + auto block_size = get_vec_size(params); auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(params), block_size, hidden_size); jit.make("HIDDEN_SIZE", hidden_size); @@ -470,9 +468,9 @@ class MoE3GemmSwigluPrefillGather : public KernelGenerator { jit.make("OUTPUT_TYPE", "half"); jit.make("OPTIONAL_SHAPE_INFO_ARG", ""); - // std::cout << "MoE3GemmSwigluPrefillGather::get_jit_constants(): hidden_size: " << hidden_size << ", block_size: " << block_size - // << ", local_threads_count: " << local_threads_count << ", batches_per_thread: " << batches_per_thread - // << ", unaligned_elements: " << unaligned_elements << std::endl; + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmSwigluPrefillGather::get_jit_constants(): hidden_size: " << hidden_size << ", block_size: " << block_size + << ", local_threads_count: " << local_threads_count << ", batches_per_thread: " << batches_per_thread + << ", unaligned_elements: " << unaligned_elements << std::endl; return jit; } @@ -885,6 +883,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return cur_moe; } + 
// Notice: don't change the order of internal buffers, it is defined in MOE3GemmInternalBufferIdx std::vector get_internal_buffer_descs(const kernel_impl_params& params) const override { auto cur_moe = params.typed_desc(); const auto& config = cur_moe->_config; @@ -900,7 +899,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { layout layout_topk_weights(ov::Shape{batch, max_topk}, data_type, cldnn::format::bfyx); internal_buffers.emplace_back(layout_topk_id, true); // 0: topk_id internal_buffers.emplace_back(layout_topk_weights, true); // 1: topk_weights - // fast single batch: scratch.up = up(x) * silu(gate(x)); scratch.y = down(scratch.up) * weight[expert_no] + // To support micro_gemm, prefill need to allocate max_topk * batch for input data of micro_gemm auto max_batch = max_topk * batch; layout layout_gateup_out(ov::Shape{max_batch, static_cast(config.inter_size)}, data_type, cldnn::format::bfyx); @@ -917,8 +916,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { internal_buffers.emplace_back(layout_gateup_out, true); // 6: gate output, scratch.gate has same layout with up // expert masks for gpu layout index_layout(ov::Shape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx); - internal_buffers.emplace_back(index_layout, true); // 7: batch - internal_buffers.emplace_back(index_layout, true); // 8: topk + internal_buffers.emplace_back(index_layout, true); // 7: expert_mask_batch + internal_buffers.emplace_back(index_layout, true); // 8: expert_mask_topk GPU_DEBUG_TRACE_DETAIL << "[DEBUG] get_internal_buffer_descs(): use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; // for micro_gemm @@ -936,21 +935,23 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { void prepare_internal_buffers(typed_primitive_inst& instance, scratch_buffers& scratch, size_t batch) { const auto& intermediates_memories = instance.get_intermediates_memories(); auto& engine = instance.get_network().get_engine(); - scratch.topk_id = 
intermediates_memories[0]; - scratch.topk_weights = intermediates_memories[1]; - scratch.up = intermediates_memories[2]; - scratch.y = intermediates_memories[3]; + scratch.topk_id = intermediates_memories[MOE_INTERNAL_BUFFER_TOPK_IDX]; + scratch.topk_weights = intermediates_memories[MOE_INTERNAL_BUFFER_TOPK_WEIGHTS]; + scratch.up = intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT]; + scratch.y = intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT]; if (batch > 1) { - scratch.x = intermediates_memories[4]; - scratch.routing_weights = intermediates_memories[5]; - scratch.gate = intermediates_memories[6]; + scratch.x = intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT]; + scratch.routing_weights = intermediates_memories[MOE_INTERNAL_BUFFER_ROUTING_WEIGHTS]; + scratch.gate = intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]; const auto& config = instance.get_typed_desc()->_config; int expert_num = static_cast(config.num_expert); scratch.expert_masks.resize(expert_num); for (int i = 0; i < expert_num; i++) { auto mask_layout = cldnn::layout({static_cast(batch)}, cldnn::data_types::i32, cldnn::format::get_default_format(1)); - scratch.expert_masks[i].batch = engine.create_subbuffer(*intermediates_memories[7], mask_layout, i * batch * sizeof(int32_t)); - scratch.expert_masks[i].topk = engine.create_subbuffer(*intermediates_memories[8], mask_layout, i * batch * sizeof(int32_t)); + scratch.expert_masks[i].batch = + engine.create_subbuffer(*intermediates_memories[MOE_INTERNAL_BUFFER_EXPERT_MASK_BATCH], mask_layout, i * batch * sizeof(int32_t)); + scratch.expert_masks[i].topk = + engine.create_subbuffer(*intermediates_memories[MOE_INTERNAL_BUFFER_EXPERT_MASK_TOPK], mask_layout, i * batch * sizeof(int32_t)); } } @@ -1303,10 +1304,15 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // mask 2: token len (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] // mask 3: expert id, dynamic shape = 
[activated_expert_num] { - cldnn::mem_lock tokens_per_expert_lock(intermediates_memories[12], stream); - cldnn::mem_lock experts_info_start_idx_lock(intermediates_memories[10], stream); - cldnn::mem_lock experts_id_lock(intermediates_memories[9], stream); - cldnn::mem_lock tokens_lens_per_expert_lock(intermediates_memories[11], stream); + cldnn::mem_lock tokens_per_expert_lock(intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], + stream); + cldnn::mem_lock experts_info_start_idx_lock( + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], + stream); + cldnn::mem_lock experts_id_lock(intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], stream); + cldnn::mem_lock tokens_lens_per_expert_lock( + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], + stream); int tokens_per_expert_iter = 0; int experts_id_iter = 0; @@ -1366,10 +1372,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // 0: gathered token: shape = [token_len * expert_topK, hidden_size] { auto hidden_size = _hidden_size; - auto block_size = GetBlockSize(*instance.get_impl_params()); + auto block_size = get_vec_size(*instance.get_impl_params()); auto [local_threads_count, batches_per_thread, unaligned_elements] = calc_thread_count(const_cast(*instance.get_impl_params()), block_size, hidden_size); - auto token_per_expert = intermediates_memories[12]->get_layout().get_shape()[0]; + auto token_per_expert = intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT]->get_layout().get_shape()[0]; # if DEBUG_MOE_LOG std::cout << "\nstep 2: prefill_gather local_threads_count=" << local_threads_count << ", batches_per_thread=" << batches_per_thread @@ -1379,7 +1385,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event = execute_stage(events, instance, *prefill_gather, - {instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), intermediates_memories[12]}, + 
{instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT]}, {scratch.x}, {static_cast(token_per_expert * local_threads_count), 1, 1}, {static_cast(local_threads_count), 1, 1}); @@ -1388,7 +1395,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { { stream.finish(); // debug // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); - // print_mem(stream, intermediates_memories[12], "token idx per expert"); + // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], "token idx per expert"); // print_mem_f16(stream, scratch.x, "gathered token"); std::cout << std::endl; } @@ -1417,11 +1424,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { # if DUMP_TENSOR_CONTENTS { stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[4], "up_token_input"); - print_mem(stream, intermediates_memories[9], "up_expert_id", num_actually_used_experts); - print_mem(stream, intermediates_memories[10], "up_input_offset_per_expert", num_actually_used_experts); - print_mem(stream, intermediates_memories[11], "up_token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[2], "up_output"); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "up_token_input"); + print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "expert_id", num_actually_used_experts); + print_mem(stream, + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], + "input_offset_per_expert", + num_actually_used_experts); + print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], "token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], "up_output"); } # endif @@ -1430,11 +1440,12 @@ class 
moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { # if DUMP_TENSOR_CONTENTS { stream.finish(); // debug - // print_mem_f16(stream, intermediates_memories[4], "gate_token_input"); - // print_mem(stream, intermediates_memories[9], "gate_expert_id", num_actually_used_experts); - // print_mem(stream, intermediates_memories[10], "gate_input_offset_per_expert", num_actually_used_experts); - // print_mem(stream, intermediates_memories[11], "gate_token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[6], "gate_output"); + // print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "gate_token_input"); + // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "gate_expert_id", num_actually_used_experts); + // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], "gate_input_offset_per_expert", + // num_actually_used_experts); print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], + // "gate_token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT], "gate_output"); } # endif } @@ -1453,20 +1464,21 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _intermediate_size << std::endl; # endif - ret_event = execute_stage({ret_event}, - instance, - *prefill_swiglu, - {intermediates_memories[2], intermediates_memories[6]}, - {intermediates_memories[6]}, - {static_cast(token_size), static_cast(_intermediate_size), 1}, - {1, subgroup_size, 1}); + ret_event = + execute_stage({ret_event}, + instance, + *prefill_swiglu, + {intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, + {intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, + {static_cast(token_size), static_cast(_intermediate_size), 1}, + 
{1, subgroup_size, 1}); # if DUMP_TENSOR_CONTENTS { ret_event->wait(); // debug stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[2], "silu_up_input"); - print_mem_f16(stream, intermediates_memories[6], "silu_gate_up_output"); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], "silu_up_input"); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT], "silu_gate_up_output"); } # endif } @@ -1493,11 +1505,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { # if DUMP_TENSOR_CONTENTS { stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[6], "down_token_input"); - // print_mem(stream, intermediates_memories[9], "down_expert_id", num_actually_used_experts); - // print_mem(stream, intermediates_memories[10], "down_input_offset_per_expert", num_actually_used_experts); - // print_mem(stream, intermediates_memories[11], "down_token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[3], "down_output"); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "down_token_input"); + // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "down_expert_id", num_actually_used_experts); + // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], "down_input_offset_per_expert", + // num_actually_used_experts); print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], + // "down_token_len", num_actually_used_experts); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT], "down_output"); } # endif } @@ -1526,13 +1539,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { ret_event = execute_stage({ret_event}, instance, *prefill_scatter_reduce, - {intermediates_memories[3], + {intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT], batch_mem_ptr, routing_mem_ptr, - 
intermediates_memories[12], - intermediates_memories[10], - intermediates_memories[11], - intermediates_memories[9]}, + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], + intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS]}, {final_hidden_states_mem_ptr}, {static_cast(token_size * local_threads_count), 1, 1}, {local_threads_count, 1, 1}, @@ -1543,13 +1556,22 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { # if DUMP_TENSOR_CONTENTS { stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[3], "scatter_reduce_input"); + print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT], "scatter_reduce_input"); print_mem(stream, batch_mem_ptr, "scatter_reduce_experts_per_token"); print_mem_f16(stream, routing_mem_ptr, "scatter_reduce_expert_weights"); - print_mem(stream, intermediates_memories[12], "scatter_reduce_tokens_per_expert"); - print_mem(stream, intermediates_memories[10], "scatter_reduce_experts_start_offset", num_actually_used_experts); - print_mem(stream, intermediates_memories[11], "scatter_reduce_tokens_len_per_expert", num_actually_used_experts); - print_mem(stream, intermediates_memories[9], "scatter_reduce_expert_id", num_actually_used_experts); + print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], "scatter_reduce_tokens_per_expert"); + print_mem(stream, + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], + "scatter_reduce_experts_start_offset", + num_actually_used_experts); + print_mem(stream, + intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], + "scatter_reduce_tokens_len_per_expert", + num_actually_used_experts); + print_mem(stream, + intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], + "scatter_reduce_expert_id", + 
num_actually_used_experts); print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); } # endif @@ -1843,12 +1865,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { GPU_DEBUG_TRACE_DETAIL << "\nMoE3GemmFusedCompressed exec(): batch=" << batch << ", max_topk=" << static_cast(config.top_k) << ", use_micro_gemm_prefill=" << use_micro_gemm_prefill << std::endl; if (use_micro_gemm_prefill) { - GPU_DEBUG_TRACE_DETAIL << "\nUse micro_gemm prefill path" << std::endl; update_rt_params(instance); return exec_prefill_micro_gemm({topk_event}, instance, scratch, expert_mask); } - GPU_DEBUG_TRACE_DETAIL << "\nUse onednn path" << std::endl; + // fallback to onednn path return exec_prefill_onednn({topk_event}, stream, instance, scratch, expert_mask); } }; From 12eb8494d97dda4d296dd27eed6108838ca99fd9 Mon Sep 17 00:00:00 2001 From: River Date: Wed, 3 Dec 2025 15:30:45 +0800 Subject: [PATCH 16/20] Fix OCL build error caused by add_stage order --- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index d7079131ee65ac..e6649c385024e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -793,7 +793,10 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { << ", arch=" << static_cast(info.arch) << std::endl; } + // Don't change the order of stages add_stage(softmax_topk, params); + add_stage(gather, params); + add_stage(scatter, params); add_stage(mlp_gate_up, params); add_stage(mlp_down, params); add_stage(mlp_reduce, params); @@ -804,9 +807,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { add_stage(prefill_swiglu, params); add_stage(micro_gemm_down, params); 
add_stage(prefill_scatter_reduce, params); - } else { - add_stage(gather, params); - add_stage(scatter, params); } } @@ -1304,8 +1304,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // mask 2: token len (input gather tokens) for each activated expert, dynamic shape = [activated_expert_num] // mask 3: expert id, dynamic shape = [activated_expert_num] { - cldnn::mem_lock tokens_per_expert_lock(intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], - stream); + cldnn::mem_lock tokens_per_expert_lock(intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], stream); cldnn::mem_lock experts_info_start_idx_lock( intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], stream); @@ -1464,14 +1463,13 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << "\nstep 4: prefill_swiglu token_size=" << token_size << ", hidden_size=" << _intermediate_size << std::endl; # endif - ret_event = - execute_stage({ret_event}, - instance, - *prefill_swiglu, - {intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, - {intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, - {static_cast(token_size), static_cast(_intermediate_size), 1}, - {1, subgroup_size, 1}); + ret_event = execute_stage({ret_event}, + instance, + *prefill_swiglu, + {intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, + {intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, + {static_cast(token_size), static_cast(_intermediate_size), 1}, + {1, subgroup_size, 1}); # if DUMP_TENSOR_CONTENTS { @@ -1568,10 +1566,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], "scatter_reduce_tokens_len_per_expert", num_actually_used_experts); - print_mem(stream, - intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], - 
"scatter_reduce_expert_id", - num_actually_used_experts); + print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "scatter_reduce_expert_id", num_actually_used_experts); print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); } # endif From 02655c7a5f2bc2f98197efe05e1a764f507ff2c5 Mon Sep 17 00:00:00 2001 From: River Date: Wed, 3 Dec 2025 20:21:53 +0800 Subject: [PATCH 17/20] Improve prefill performance --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 2 +- .../intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index e6649c385024e0..a0e32a430d50af 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -1550,7 +1550,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { instance.needs_completion_event(), {num_actually_used_experts}); // TODO: remove this sync which maybe lead to output is incorrect - stream.finish(); + // stream.finish(); # if DUMP_TENSOR_CONTENTS { stream.finish(); // debug diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl index 0ab78a37340669..4dff6e5490ff4d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_3gemm_swiglu_fuse.cl @@ -83,7 +83,7 @@ KERNEL (gather_2d_ref)( dst_tok += k * HIDDEN_SIZE; if (off >= HIDDEN_SIZE) { - printf("Warning off >= HIDDEN_SIZE: k = %d, off = %d, HIDDEN_SIZE = %d\n", k, off, HIDDEN_SIZE); + // printf("Warning off >= HIDDEN_SIZE: k = %d, off = %d, HIDDEN_SIZE = %d\n", k, off, HIDDEN_SIZE); return; } From 
c9926175cddd0fb9284fa44052316d73d839e169 Mon Sep 17 00:00:00 2001 From: River Date: Thu, 4 Dec 2025 14:56:26 +0800 Subject: [PATCH 18/20] Remove debug code --- .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 227 +----------------- 1 file changed, 1 insertion(+), 226 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index a0e32a430d50af..7099019c6615f0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -7,8 +7,7 @@ #include "moe_3gemm_swiglu_opt.hpp" // clang-format on -#define DUMP_TENSOR_CONTENTS 0 -#define DEBUG_MOE_LOG 0 +#define DEBUG_MOE_LOG 0 #ifdef ENABLE_ONEDNN_FOR_GPU # include @@ -1169,108 +1168,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { return ret; } -# if DUMP_TENSOR_CONTENTS - void print_mem_f16(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 50) { - auto layout = mem->get_layout().get_shape(); - size_t row = 0; - size_t col = 0; - - switch (layout.size()) { - case 1: - row = 1; - col = layout[0]; - break; - case 2: - row = layout[0]; - col = layout[1]; - break; - case 3: - row = layout[0] * layout[1]; - col = layout[2]; - break; - case 4: - row = layout[0] * layout[1] * layout[2]; - col = layout[3]; - break; - default: - OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); - } - - cldnn::mem_lock lock_data{mem, stream}; - std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; - for (size_t j = 0; j < row && j < max_row; j++) { - std::cout << "\t[" << j << "]: "; - for (size_t i = 0; i < col && i < 16; i++) { - ov::float16 v = ov::float16::from_bits(lock_data[j * col + i]); - std::cout << static_cast(v) << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - }; - - void 
print_mem_u4(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, size_t max_row = 50) { - auto layout = mem->get_layout().get_shape(); - size_t row = 0; - size_t col = 0; - - switch (layout.size()) { - case 1: - row = 1; - col = layout[0]; - break; - case 2: - row = layout[0]; - col = layout[1]; - break; - case 3: - row = layout[0] * layout[1]; - col = layout[2]; - break; - case 4: - row = layout[0] * layout[1] * layout[2]; - col = layout[3]; - break; - default: - OPENVINO_THROW("print_mem_f16 not support layout size ", layout.size()); - } - - col = col / 2; // u4 - cldnn::mem_lock lock_data{mem, stream}; - std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; - for (size_t j = 0; j < row && j < max_row; j++) { - std::cout << "\t[" << j << "]: "; - for (size_t i = 0; i < col && i < 16; i++) { - uint8_t byte_val = lock_data[j * col + i]; - std::cout << (byte_val & 0xF) << ", " << ((byte_val >> 4) & 0xF) << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - }; - - void print_mem(cldnn::stream& stream, memory::ptr mem, const std::string& mem_name, int max_print = 1024) { - auto layout = mem->get_layout().get_shape(); - size_t row = layout.size() >= 2 ? layout[layout.size() - 2] : 1; - size_t col = layout.size() >= 2 ? layout[layout.size() - 1] : layout[0]; - cldnn::mem_lock lock_data{mem, stream}; - std::cout << mem_name << ": layout = " << mem->get_layout().to_short_string() << std::endl; - int print_cnt = 0; - for (size_t j = 0; j < row; j++) { - std::cout << "\t[" << j << "]: "; - for (size_t i = 0; i < col; i++) { - if (print_cnt++ >= max_print) { - std::cout << "..." 
<< std::endl; - return; - } - std::cout << lock_data[j * col + i] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - }; -# endif - cldnn::event::ptr exec_prefill_micro_gemm(const std::vector& events, typed_primitive_inst& instance, scratch_buffers& scratch, @@ -1389,16 +1286,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {scratch.x}, {static_cast(token_per_expert * local_threads_count), 1, 1}, {static_cast(local_threads_count), 1, 1}); - -# if DUMP_TENSOR_CONTENTS - { - stream.finish(); // debug - // print_mem_f16(stream, instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::HIDDEN_STATES)), "input token"); - // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], "token idx per expert"); - // print_mem_f16(stream, scratch.x, "gathered token"); - std::cout << std::endl; - } -# endif } // step 3: moe_gemm for up and gate @@ -1419,34 +1306,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << "\nstep 3: moe_gemm for up and gate" << std::endl; # endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_up); - -# if DUMP_TENSOR_CONTENTS - { - stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "up_token_input"); - print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "expert_id", num_actually_used_experts); - print_mem(stream, - intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], - "input_offset_per_expert", - num_actually_used_experts); - print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], "token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], "up_output"); - } -# endif - ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_gate); - -# if DUMP_TENSOR_CONTENTS - { - stream.finish(); // debug - // 
print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "gate_token_input"); - // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "gate_expert_id", num_actually_used_experts); - // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], "gate_input_offset_per_expert", - // num_actually_used_experts); print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], - // "gate_token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT], "gate_output"); - } -# endif } // step 4: post proc - gate_up = silu(gate)*up, silu(x)=x*sigmod(x)=x*(1+exp(-x)) @@ -1470,15 +1330,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT]}, {static_cast(token_size), static_cast(_intermediate_size), 1}, {1, subgroup_size, 1}); - -# if DUMP_TENSOR_CONTENTS - { - ret_event->wait(); // debug - stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_UP_OUTPUT], "silu_up_input"); - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_OUTPUT], "silu_gate_up_output"); - } -# endif } // step 5: moe_gemm for down @@ -1499,18 +1350,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { std::cout << "\nstep 5: moe_gemm for down" << std::endl; # endif ret_event = PrimitiveImplOCL::execute_stage({ret_event}, instance, micro_gemm_down); - -# if DUMP_TENSOR_CONTENTS - { - stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_GATE_UP_INPUT], "down_token_input"); - // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "down_expert_id", num_actually_used_experts); - // print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], "down_input_offset_per_expert", - // 
num_actually_used_experts); print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], - // "down_token_len", num_actually_used_experts); - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT], "down_output"); - } -# endif } // step 6: scatter and reduce @@ -1549,27 +1388,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {local_threads_count, 1, 1}, instance.needs_completion_event(), {num_actually_used_experts}); - // TODO: remove this sync which maybe lead to output is incorrect - // stream.finish(); -# if DUMP_TENSOR_CONTENTS - { - stream.finish(); // debug - print_mem_f16(stream, intermediates_memories[MOE_INTERNAL_BUFFER_DOWN_OUTPUT], "scatter_reduce_input"); - print_mem(stream, batch_mem_ptr, "scatter_reduce_experts_per_token"); - print_mem_f16(stream, routing_mem_ptr, "scatter_reduce_expert_weights"); - print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_IDX_PER_EXPERT], "scatter_reduce_tokens_per_expert"); - print_mem(stream, - intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_START_OFFSET_PER_EXPERT], - "scatter_reduce_experts_start_offset", - num_actually_used_experts); - print_mem(stream, - intermediates_memories[MOE_INTERNAL_BUFFER_TOKEN_LEN_PER_ACTIVATED_EXPERT], - "scatter_reduce_tokens_len_per_expert", - num_actually_used_experts); - print_mem(stream, intermediates_memories[MOE_INTERNAL_BUFFER_ACTIVATED_EXPERT_IDS], "scatter_reduce_expert_id", num_actually_used_experts); - print_mem_f16(stream, final_hidden_states_mem_ptr, "final_hidden_states"); - } -# endif } return ret_event; @@ -1719,18 +1537,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {1, lws_size}, instance.needs_completion_event()); -# if DUMP_TENSOR_CONTENTS - { - // debug print - std::cout << "expert_no=" << expert_no << ", n_token=" << n_token << ", hidden_size=" << _hidden_size - << ", intermediate_size=" << _intermediate_size << std::endl; - stream.finish(); // debug - 
print_mem_f16(stream, hidden_states_mem_ptr, "input_token"); - print_mem_f16(stream, scratch.x, "gathered_token", n_token); - print_mem_f16(stream, scratch.routing_weights, "routing_weights"); - } -# endif - // up kernel.up.forward(dnn_stream, n_token, @@ -1738,14 +1544,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), dnnl::memory()); -# if DUMP_TENSOR_CONTENTS - { - // debug print - stream.finish(); // debug - print_mem_f16(stream, scratch.up, "up_output", n_token); - } -# endif - // gate kernel.gate.forward(dnn_stream, n_token, @@ -1753,14 +1551,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab)); -# if DUMP_TENSOR_CONTENTS - { - // debug print - stream.finish(); // debug - print_mem_f16(stream, scratch.gate, "gate_up_output", n_token); - } -# endif - // down kernel.down.forward(dnn_stream, n_token, @@ -1768,14 +1558,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.routing_weights, {static_cast(n_token * max_topk)}, dnnl::memory::format_tag::a)); -# if DUMP_TENSOR_CONTENTS - { - // debug print - stream.finish(); // debug - print_mem_f16(stream, scratch.y, "down_with_weights_output", n_token); - } -# endif - // index_add result_event = execute_stage({result_event}, instance, @@ -1785,13 +1567,6 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { {static_cast(n_token), static_cast(_hidden_size)}, {1, lws_size}, instance.needs_completion_event()); -# if DUMP_TENSOR_CONTENTS - { - // debug print - stream.finish(); // debug - print_mem_f16(stream, final_hidden_states_mem_ptr, "final_output"); - } -# endif 
} return result_event; From e8fbacf39cfee1b7b2725db95a46c38f7076825a Mon Sep 17 00:00:00 2001 From: River Date: Thu, 4 Dec 2025 22:10:25 +0800 Subject: [PATCH 19/20] Fixed long pipeline initialization latency issue --- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp | 41 ++++++++++++----- .../impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp | 45 ++++++++++++++++++- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp index 58740aa0691cc3..1d54330ede0dc0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.cpp @@ -142,15 +142,9 @@ static micro::Type convert_type(ov::element::Type t) { } std::mutex MoE3GemmMicroGenerator::mtx; +std::unordered_map MoE3GemmMicroGenerator::s_gemm_cache; void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, micro::Package& gemm_moe, MoE3GemmMicroKernelType type) noexcept { - // TODO: Remove once micro API is thread safe std::lock_guard l(mtx); - // auto moe_cfg = get_moe_cfg(params); - const auto& device_info = params.get_device_info(); - micro::HWInformation hw_info; - hw_info.euCount = device_info.execution_units_count; - hw_info.gmdid = device_info.ip_version; - hw_info.systolicAvailable = device_info.supports_immad; int wei_idx, scale_idx, zp_idx; switch (type) { @@ -174,6 +168,31 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, break; } + const auto& weight_layout = params.get_input_layout(wei_idx); + const auto& scale_layout = params.get_input_layout(scale_idx); + const auto& zp_layout = params.get_input_layout(zp_idx); + + MoE3GemmMicroGenerator::GemmCacheKey key; + key.weight_shape = weight_layout.get_shape(); + key.weight_dt = weight_layout.data_type; + key.scale_shape = scale_layout.get_shape(); + key.scale_dt = 
scale_layout.data_type; + key.zp_shape = zp_layout.get_shape(); + key.zp_dt = zp_layout.data_type; + + auto it = s_gemm_cache.find(key); + if (it != s_gemm_cache.end()) { + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::init_microkernels: hit cache by layout\n"; + gemm_moe = it->second; + return; + } + + const auto& device_info = params.get_device_info(); + micro::HWInformation hw_info; + hw_info.euCount = device_info.execution_units_count; + hw_info.gmdid = device_info.ip_version; + hw_info.systolicAvailable = device_info.supports_immad; + // weight layout example: u4:bfyx:4x3072x8x128:nopad const auto& weight_shape = params.get_input_layout(wei_idx).get_shape(); const bool is_prefill = true; @@ -245,7 +264,10 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params, GPU_DEBUG_TRACE_DETAIL << "sizes to select gemm : m : " << m << " n : " << n << " k : " << k << std::endl; try { /* Ask microkernel provider for microkernel */ - gemm_moe = micro::select_gemm_microkernel(opts_moe, hw_info, sizes, problem_moe); + micro::Package pkg = micro::select_gemm_microkernel(opts_moe, hw_info, sizes, problem_moe); + s_gemm_cache.emplace(key, pkg); + gemm_moe = std::move(pkg); + GPU_DEBUG_TRACE_DETAIL << "MoE3GemmMicroGenerator::init_microkernels: create and cache new micro kernel" << std::endl; } catch (const std::runtime_error& ex) { OPENVINO_THROW("Can't create moe micro kernel: ", ex.what()); } @@ -314,9 +336,6 @@ std::string MoE3GemmMicroGenerator::get_build_options(const kernel_impl_params& Arguments MoE3GemmMicroGenerator::get_arguments_desc(const kernel_impl_params& params) const { Arguments args; - // if (params.is_dynamic()) - // args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); - // auto cfg = get_moe_cfg(params); auto desc = params.typed_desc(); switch (m_type) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp index 
2a5ec5ed7832bd..b4e52172182c27 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_gen_micro.hpp @@ -66,7 +66,7 @@ class MoE3GemmMicroGenerator : public MoEGemmOptGeneratorBase { static const moe_3gemm_config get_moe_3gemm_cfg(const kernel_impl_params& params) { moe_3gemm_config cfg; auto desc = params.typed_desc(); - cfg.weight_group_size = desc->_config.group_size; + cfg.weight_group_size = static_cast(desc->_config.group_size); cfg.has_batch_dim = desc->_config.has_batch_dim; return cfg; } @@ -77,6 +77,49 @@ class MoE3GemmMicroGenerator : public MoEGemmOptGeneratorBase { int m_scale_idx; int m_zp_idx; static std::mutex mtx; + + struct GemmCacheKey { + ov::Shape weight_shape; + ov::element::Type weight_dt; + + ov::Shape scale_shape; + ov::element::Type scale_dt; + + ov::Shape zp_shape; + ov::element::Type zp_dt; + + bool operator==(const GemmCacheKey& other) const { + return weight_shape == other.weight_shape && weight_dt == other.weight_dt && scale_shape == other.scale_shape && scale_dt == other.scale_dt && + zp_shape == other.zp_shape && zp_dt == other.zp_dt; + } + }; + + struct GemmCacheKeyHash { + size_t operator()(const GemmCacheKey& k) const noexcept { + size_t h = 0; + + auto hash_combine = [](size_t& seed, size_t v) { + seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2); + }; + + auto hash_shape = [&](const ov::Shape& s) { + for (auto v : s) { + hash_combine(h, std::hash()(v)); + } + }; + + hash_shape(k.weight_shape); + hash_shape(k.scale_shape); + hash_shape(k.zp_shape); + + hash_combine(h, std::hash()(k.weight_dt.to_string())); + hash_combine(h, std::hash()(k.scale_dt.to_string())); + hash_combine(h, std::hash()(k.zp_dt.to_string())); + return h; + } + }; + + static std::unordered_map s_gemm_cache; }; #endif } // namespace ov::intel_gpu::ocl From 8d989eb674e6c023ea4dc824d788635d2c87daf5 Mon Sep 17 00:00:00 2001 From: River Date: Thu, 4 Dec 2025 23:06:16 
+0800 Subject: [PATCH 20/20] clang-format issue --- .../intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp index 836b66c851a2e0..e8a8945552728a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_base.hpp @@ -9,8 +9,6 @@ #include "moe_gemm_base.hpp" -// #define ENABLE_ONEDNN_FOR_GPU - namespace ov::intel_gpu::ocl { #define MOE_INTERNAL_BUFFER_TOPK_IDX 0 // topk_idx @@ -48,7 +46,6 @@ struct moe_3gemm_config { bool has_batch_dim = false; // 0 - pa, 1 - non-pa }; -struct MoE3GemmRuntimeParams : public MoEGemmRuntimeParams { -}; +struct MoE3GemmRuntimeParams : public MoEGemmRuntimeParams {}; } // namespace ov::intel_gpu::ocl \ No newline at end of file