Skip to content

Commit 886e6ae

Browse files
committed
issue/40: 实现沐曦rms_norm算子
1 parent 6173461 commit 886e6ae

File tree

9 files changed

+238
-28
lines changed

9 files changed

+238
-28
lines changed

Diff for: src/infiniop/devices/maca/common_maca.h

+20
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
#define CHECK_MCBLAS(API) CHECK_INTERNAL(API, HCBLAS_STATUS_SUCCESS)
99
#define CHECK_MCDNN(API) CHECK_INTERNAL(API, HCDNN_STATUS_SUCCESS)
1010

11+
#define INFINIOP_MACA_KERNEL __global__ void
12+
13+
#define MACA_BLOCK_SIZE_1024 1024
14+
#define MACA_BLOCK_SIZE_512 512
15+
1116
namespace device::maca {
1217

1318
class Handle::Internal {
@@ -17,9 +22,24 @@ class Handle::Internal {
1722
template <typename T>
1823
using Fn = std::function<infiniStatus_t(T)>;
1924

25+
int _warp_size,
26+
_max_threads_per_block,
27+
_block_size[3],
28+
_grid_size[3];
29+
2030
public:
31+
Internal(int);
2132
infiniStatus_t useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const;
2233
infiniStatus_t useMcdnn(hcStream_t stream, const Fn<hcdnnHandle_t> &f) const;
34+
35+
int warpSize() const;
36+
int maxThreadsPerBlock() const;
37+
int blockSizeX() const;
38+
int blockSizeY() const;
39+
int blockSizeZ() const;
40+
int gridSizeX() const;
41+
int gridSizeY() const;
42+
int gridSizeZ() const;
2343
};
2444

2545
hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt);

Diff for: src/infiniop/devices/maca/maca_handle.cc

+23-1
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,27 @@
33
namespace device::maca {
44
Handle::Handle(infiniDevice_t device, int device_id)
55
: InfiniopHandle{device, device_id},
6-
_internal(std::make_shared<Handle::Internal>()) {}
6+
_internal(std::make_shared<Handle::Internal>(device_id)) {}
77

88
Handle::Handle(int device_id) : Handle(INFINI_DEVICE_METAX, device_id) {}
99

1010
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
1111
return _internal;
1212
}
1313

14+
// Queries the device once at handle creation and caches its launch limits
// (warp size, max threads per block, max block/grid dimensions) so later
// kernel-launch decisions avoid repeated driver calls.
Handle::Internal::Internal(int device_id) {
    hcDeviceProp_t prop;
    // NOTE(review): the return status of hcGetDeviceProperties is ignored —
    // if the query fails, `prop` (and therefore every cached field below) is
    // read uninitialized. Confirm the call cannot fail here, or add a check.
    hcGetDeviceProperties(&prop, device_id);
    _warp_size = prop.warpSize;
    _max_threads_per_block = prop.maxThreadsPerBlock;
    _block_size[0] = prop.maxThreadsDim[0];
    _block_size[1] = prop.maxThreadsDim[1];
    _block_size[2] = prop.maxThreadsDim[2];
    _grid_size[0] = prop.maxGridSize[0];
    _grid_size[1] = prop.maxGridSize[1];
    _grid_size[2] = prop.maxGridSize[2];
}
26+
1427
infiniStatus_t Handle::Internal::useMcblas(hcStream_t stream, const Fn<hcblasHandle_t> &f) const {
1528
auto handle = mcblas_handles.pop();
1629
if (!handle) {
@@ -33,6 +46,15 @@ infiniStatus_t Handle::Internal::useMcdnn(hcStream_t stream, const Fn<hcdnnHandl
3346
return INFINI_STATUS_SUCCESS;
3447
}
3548

49+
// Read-only accessors for the device limits cached by the constructor above.
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
int Handle::Internal::blockSizeX() const { return _block_size[0]; }
int Handle::Internal::blockSizeY() const { return _block_size[1]; }
int Handle::Internal::blockSizeZ() const { return _block_size[2]; }
int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
57+
3658
hcdnnDataType_t getHcdnnDtype(infiniDtype_t dt) {
3759
switch (dt) {
3860
case INFINI_DTYPE_F16:

Diff for: src/infiniop/ops/rms_norm/maca/rms_norm_kernel.cuh

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#ifndef __RMS_NORM_MACA_KERNEL_H__
2+
#define __RMS_NORM_MACA_KERNEL_H__
3+
4+
#include "../../../reduce/maca/reduce.cuh"
5+
6+
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
7+
INFINIOP_MACA_KERNEL rmsnormBlock(
8+
Tdata *__restrict__ y,
9+
ptrdiff_t stride_y,
10+
const Tdata *__restrict__ x,
11+
ptrdiff_t stride_x,
12+
const Tweight *__restrict__ w,
13+
size_t dim,
14+
float epsilon) {
15+
// Each block takes care of a row of continuous data of length dim
16+
// Each thread deals with every block_size element in the row
17+
auto y_ptr = y + blockIdx.x * stride_y;
18+
auto x_ptr = x + blockIdx.x * stride_x;
19+
auto w_ptr = w;
20+
21+
// Block-reduce sum of x^2
22+
Tcompute ss = op::common_maca::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(x_ptr, dim);
23+
24+
// Thread_0 computes RMS=1/sqrt(ss/dim+epsilon) and stores in shared memory
25+
__shared__ Tcompute rms;
26+
if (threadIdx.x == 0) {
27+
rms = Tdata(rsqrtf(ss / Tcompute(dim) + epsilon));
28+
}
29+
__syncthreads();
30+
31+
for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
32+
y_ptr[i] = Tdata(Tcompute(x_ptr[i]) * Tcompute(w_ptr[i]) * rms);
33+
}
34+
}
35+
36+
#endif

Diff for: src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __RMS_NORM_MACA_H__
2+
#define __RMS_NORM_MACA_H__
3+
4+
#include "../rms_norm.h"
5+
6+
DESCRIPTOR(maca)
7+
8+
#endif

Diff for: src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#include "../../../devices/maca/common_maca.h"
2+
#include "rms_norm_kernel.cuh"
3+
#include "rms_norm_maca.cuh"
4+
5+
namespace op::rms_norm::maca {
6+
7+
// Backend-private state: keeps the shared MACA handle internals (device
// limits, library handles) alive for the lifetime of the descriptor.
struct Descriptor::Opaque {
    std::shared_ptr<device::maca::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // Descriptor owns its Opaque; releasing it drops the shared_ptr ref.
    delete _opaque;
}
15+
// Builds an RMSNorm descriptor for the MACA backend. Validates the tensor
// descriptors, rejects non-contiguous innermost dimensions, and stores the
// handle internals needed at launch time. No workspace is required.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    auto info_result = RMSNormInfo::create(y_desc, x_desc, w_desc, epsilon);
    CHECK_RESULT(info_result);
    auto info = info_result.take();

    // The kernel walks each row linearly, so the innermost dimension of both
    // input and output must be contiguous.
    const bool last_dim_contiguous = (info.x_strides[1] == 1) && (info.y_strides[1] == 1);
    if (!last_dim_contiguous) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto maca_handle = reinterpret_cast<device::maca::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{maca_handle->internal()},
        std::move(info),
        /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
38+
39+
// launch kernel with different data types
40+
template <unsigned int BLOCK_SIZE>
41+
infiniStatus_t launchKernel(
42+
uint32_t batch_size, size_t dim,
43+
void *y, infiniDtype_t atype, ptrdiff_t stride_y,
44+
const void *x, ptrdiff_t stride_x,
45+
const void *w, infiniDtype_t wtype,
46+
float epsilon,
47+
hcStream_t maca_stream) {
48+
49+
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
50+
rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
51+
reinterpret_cast<Tdata *>(y), \
52+
stride_y, \
53+
reinterpret_cast<const Tdata *>(x), \
54+
stride_x, \
55+
reinterpret_cast<const Tweight *>(w), \
56+
dim, \
57+
epsilon)
58+
59+
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
60+
LAUNCH_KERNEL(half, half, float);
61+
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
62+
LAUNCH_KERNEL(half, float, float);
63+
} else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
64+
LAUNCH_KERNEL(float, float, float);
65+
} else {
66+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
67+
}
68+
69+
#undef LAUNCH_KERNEL
70+
71+
return INFINI_STATUS_SUCCESS;
72+
}
73+
74+
// Runs RMSNorm: for each of the batch_size rows,
//   y = x * w / sqrt(mean(x^2) + epsilon).
// `stream` is an hcStream_t; `workspace` is unused (required size is 0).
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x, const void *w,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    auto stride_x = _info.x_strides[0];
    auto stride_y = _info.y_strides[0];
    auto dim = _info.dim();
    uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
    auto maca_stream = reinterpret_cast<hcStream_t>(stream);

    // Pick the largest block size the device supports.
    // FIX: the original compared maxThreadsPerBlock with == 1024 / == 512,
    // which rejected any device whose per-block thread limit exceeds 1024;
    // >= selects the same branch for 1024/512 devices and additionally
    // supports larger limits.
    int max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads >= MACA_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<MACA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
    } else if (max_threads >= MACA_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<MACA_BLOCK_SIZE_512>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}
99+
} // namespace op::rms_norm::maca

Diff for: src/infiniop/ops/rms_norm/operator.cc

+11-16
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#ifdef ENABLE_ASCEND_API
1212
#include "ascend/rms_norm_aclnn.h"
1313
#endif
14+
#ifdef ENABLE_METAX_API
15+
#include "maca/rms_norm_maca.cuh"
16+
#endif
1417

1518
__C infiniStatus_t infiniopCreateRMSNormDescriptor(
1619
infiniopHandle_t handle,
@@ -45,10 +48,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
4548
#ifdef ENABLE_ASCEND_API
4649
CREATE(INFINI_DEVICE_ASCEND, ascend)
4750
#endif
48-
#ifdef ENABLE_METAX_GPU
49-
case DevMetaxGpu: {
50-
return macaCreateRMSNormDescriptor((MacaHandle_t)handle, (RMSNormMacaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
51-
}
51+
#ifdef ENABLE_METAX_API
52+
CREATE(INFINI_DEVICE_METAX, maca)
5253
#endif
5354
#ifdef ENABLE_MTHREADS_GPU
5455
case DevMthreadsGpu: {
@@ -84,10 +85,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
8485
#ifdef ENABLE_ASCEND_API
8586
GET(INFINI_DEVICE_ASCEND, ascend)
8687
#endif
87-
#ifdef ENABLE_METAX_GPU
88-
case DevMetaxGpu: {
89-
return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
90-
}
88+
#ifdef ENABLE_METAX_API
89+
GET(INFINI_DEVICE_METAX, maca)
9190
#endif
9291
#ifdef ENABLE_MTHREADS_GPU
9392
case DevMthreadsGpu: {
@@ -124,10 +123,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
124123
#ifdef ENABLE_ASCEND_API
125124
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
126125
#endif
127-
#ifdef ENABLE_METAX_GPU
128-
case DevMetaxGpu: {
129-
return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
130-
}
126+
#ifdef ENABLE_METAX_API
127+
CALCULATE(INFINI_DEVICE_METAX, maca)
131128
#endif
132129
#ifdef ENABLE_MTHREADS_GPU
133130
case DevMthreadsGpu: {
@@ -163,10 +160,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
163160
#ifdef ENABLE_ASCEND_API
164161
DESTROY(INFINI_DEVICE_ASCEND, ascend)
165162
#endif
166-
#ifdef ENABLE_METAX_GPU
167-
case DevMetaxGpu: {
168-
return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
169-
}
163+
#ifdef ENABLE_METAX_API
164+
DESTROY(INFINI_DEVICE_METAX, maca)
170165
#endif
171166
#ifdef ENABLE_MTHREADS_GPU
172167
case DevMthreadsGpu: {

Diff for: src/infiniop/reduce/maca/reduce.cuh

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#ifndef __INFINIOP_REDUCE_MACA_H__
2+
#define __INFINIOP_REDUCE_MACA_H__
3+
4+
#include <cub/block/block_reduce.cuh>
5+
6+
namespace op::common_maca::reduce_op {
7+
8+
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
9+
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr,
10+
size_t count) {
11+
Tcompute ss = 0;
12+
13+
// Each thread computes its partial sum
14+
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
15+
ss += Tcompute(data_ptr[i] * data_ptr[i]);
16+
}
17+
18+
// Use CUB block-level reduction
19+
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
20+
__shared__ typename BlockReduce::TempStorage temp_storage;
21+
22+
return BlockReduce(temp_storage).Sum(ss);
23+
}
24+
25+
} // namespace op::common_maca::reduce_op
26+
27+
#endif

Diff for: src/infinirt/infinirt.cc

+8
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include "bang/infinirt_bang.h"
55
#include "cpu/infinirt_cpu.h"
66
#include "cuda/infinirt_cuda.cuh"
7+
#include "maca/infinirt_maca.h"
8+
#include "musa/infinirt_musa.h"
79

810
thread_local infiniDevice_t CURRENT_DEVICE_TYPE = INFINI_DEVICE_CPU;
911
thread_local int CURRENT_DEVICE_ID = 0;
@@ -58,6 +60,12 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
5860
case INFINI_DEVICE_ASCEND: \
5961
_status = infinirt::ascend::API PARAMS; \
6062
break; \
63+
case INFINI_DEVICE_METAX: \
64+
_status = infinirt::maca::API PARAMS; \
65+
break; \
66+
case INFINI_DEVICE_MOORE: \
67+
_status = infinirt::musa::API PARAMS; \
68+
break; \
6169
default: \
6270
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
6371
} \

Diff for: xmake/maca.lua

+6-11
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
11

22
local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT")
3-
43
add_includedirs(MACA_ROOT .. "/include")
54
add_linkdirs(MACA_ROOT .. "/lib")
6-
add_links("libhcdnn.so")
7-
add_links("libhcblas.so")
8-
add_links("libhcruntime.so")
5+
add_links("hcdnn", "hcblas", "hcruntime")
96

107
rule("maca")
118
set_extensions(".maca")
@@ -34,21 +31,19 @@ rule_end()
3431
target("infiniop-metax")
3532
set_kind("static")
3633
on_install(function (target) end)
37-
add_cxflags("-lstdc++ -Wall -fPIC")
3834
set_languages("cxx17")
39-
set_warnings("all")
40-
35+
set_warnings("all", "error")
36+
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
4137
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
4238
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
43-
4439
target_end()
4540

4641
target("infinirt-metax")
4742
set_kind("static")
4843
set_languages("cxx17")
4944
on_install(function (target) end)
5045
add_deps("infini-utils")
51-
-- Add files
52-
add_files("$(projectdir)/src/infinirt/maca/*.cc")
53-
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
46+
set_warnings("all", "error")
47+
add_cxflags("-lstdc++ -fPIC")
48+
add_files("../src/infinirt/maca/*.cc")
5449
target_end()

0 commit comments

Comments
 (0)