Add Leaky RELU kernel implementation (#2)

abhay-lal · web-flow · commit 63caafa28b23 · 2025-11-07T14:01:11.000-08:00
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper:
 | [Reduction]() | Reduction | bfloat16 | 🟡 |  |
 | [Dequant](./aie_kernels/aie2/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 | 🟢 | [example/dequant/](./example/dequant/) |
 | [RELU](./aie_kernels/aie2p/relu.cc) | RELU | bfloat16 | 🟢 | [example/relu/](./example/relu/) |
-| [Leaky RELU]() | Leaky RELU | bfloat16 | ⚪ |  |
+| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) | Leaky RELU kernel | bfloat16 | 🟢 | [example/leaky_relu/](./example/leaky_relu/) |
 | [GELU](./aie_kernels/aie2p/gelu.cc) | GELU | bfloat16 | 🟢 | [example/gelu/](./example/gelu/) |
 | [LayerNorm](./aie_kernels/aie2p/layer_norm.cc) | LayerNorm | bfloat16 | 🟢 | [example/layer_norm/](./example/layer_norm/) |
 | [Convolution]() | Convolution | bfloat16 | 🟡 |  |
diff --git a/aie_kernels/aie2p/CMakeLists.txt b/aie_kernels/aie2p/CMakeLists.txt
@@ -3,6 +3,7 @@
 
 add_aie_kernel(gelu.o SOURCES gelu.cc)
 add_aie_kernel(layer_norm.o SOURCES layer_norm.cc)
+add_aie_kernel(leaky_relu.o SOURCES leaky_relu.cc)
 add_aie_kernel(relu.o SOURCES relu.cc)
 add_aie_kernel(rms_norm.o SOURCES rms_norm.cc)
 add_aie_kernel(silu.o SOURCES silu.cc)
diff --git a/aie_kernels/aie2p/leaky_relu.cc b/aie_kernels/aie2p/leaky_relu.cc
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../aie_kernel_utils.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+using namespace aie;
+
+/**
+ * Element-wise Leaky ReLU on a bfloat16 buffer, processed 32 lanes at a time.
+ *
+ * Each output lane is max(x, alpha * x). That matches the usual definition
+ * (x for x > 0, alpha * x otherwise) only while 0 <= alpha <= 1; for other
+ * alpha values the max picks the wrong branch.
+ *
+ * @param a           input elements
+ * @param c           output elements (declared restrict, so must not alias a)
+ * @param vector_size number of elements; the loop advances by 32, so a value
+ *                    that is not a multiple of 32 makes the last iteration
+ *                    read and write a full 32-element vector past the end
+ * @param alpha       slope applied to the input before taking the max
+ */
+void leaky_relu_vectorized_bf16(bfloat16 *restrict a,
+                                bfloat16 *restrict c,
+                                const int32_t vector_size,
+                                const bfloat16 alpha)
+{
+    event0();
+
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)a);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)c);
+
+    // Broadcast alpha once so the loop body is a pure vector multiply + max
+    vector<bfloat16, 32> alpha_vec = aie::broadcast<bfloat16, 32>(alpha);
+
+    AIE_PREPARE_FOR_PIPELINING
+    // NOTE(review): presumably promises the compiler at least 32 iterations,
+    // i.e. vector_size >= 1024 - confirm against aie_kernel_utils.h and the
+    // tile sizes the callers actually pass.
+    AIE_LOOP_MIN_ITERATION_COUNT(32)
+    for (int i = 0; i < vector_size; i += 32) {
+        vector<bfloat16, 32> input = *it_in++;
+        // Leaky RELU as max(x, alpha * x); alpha is typically 0.01
+        vector<bfloat16, 32> alpha_times_input = aie::mul(input, alpha_vec);
+        vector<bfloat16, 32> output = aie::max(input, alpha_times_input);
+        *it_out++ = output;
+    }
+
+    event1();
+}
+
+extern "C" {
+
+// C-linkage entry point: hands its four arguments, unchanged and in order,
+// to leaky_relu_vectorized_bf16.
+void leaky_relu_bf16(bfloat16 *restrict input,
+                     bfloat16 *restrict output,
+                     int input_size,
+                     bfloat16 alpha)
+{
+    leaky_relu_vectorized_bf16(input, output, input_size, alpha);
+}
+
+} // extern "C"
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(elementwise_add)
 add_subdirectory(gelu)
 add_subdirectory(gemm)
 add_subdirectory(layer_norm)
+add_subdirectory(leaky_relu)
 add_subdirectory(matrix_vector_mul)
 add_subdirectory(mem_copy)
 add_subdirectory(relu)
diff --git a/example/leaky_relu/CMakeLists.txt b/example/leaky_relu/CMakeLists.txt
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Registers one Leaky ReLU configuration: the AIE executable, the golden
+# reference generator, and the CI test that runs the host binary against it.
+#
+#   INPUT_LENGTH - forwarded as -l to leaky_relu.py and the host, and as
+#                  --input_length to the golden generator
+#   NUM_COLUMNS  - forwarded as --co to leaky_relu.py
+#   NUM_CHANNELS - forwarded as --chan to leaky_relu.py
+#   TILE_SIZE    - forwarded as --tile-size to leaky_relu.py
+#   TRACE_SIZE   - forwarded as --trace-size to leaky_relu.py
+function(add_aie_leaky_relu_design INPUT_LENGTH NUM_COLUMNS NUM_CHANNELS TILE_SIZE TRACE_SIZE)
+
+    set(EXAMPLE "leaky_relu_${NUM_COLUMNS}_cols_${NUM_CHANNELS}_channels_${INPUT_LENGTH}_tile_${TILE_SIZE}")
+    set(GOLDEN_VALUES_PATH "${CMAKE_BINARY_DIR}/example/leaky_relu/golden_${EXAMPLE}")
+    set(INPUT_DATA_TYPE_CPP bfloat16_t)
+    set(OUTPUT_DATA_TYPE_CPP bfloat16_t)
+    # The design generator, the golden generator and the host run must all use
+    # the same alpha, so it is defined exactly once here.
+    set(LEAKY_RELU_ALPHA 0.01)
+
+    add_aie_executable(${EXAMPLE}
+        HOST leaky_relu.cpp
+        HOST_FLAGS DTYPE_IN=${INPUT_DATA_TYPE_CPP} DTYPE_OUT=${OUTPUT_DATA_TYPE_CPP}
+        PYTHON leaky_relu.py
+        PYTHON_FLAGS --dev ${DEVICE} -l ${INPUT_LENGTH} --co ${NUM_COLUMNS} --chan ${NUM_CHANNELS} --tile-size ${TILE_SIZE} --trace-size ${TRACE_SIZE} --alpha ${LEAKY_RELU_ALPHA} --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir
+        AIE_CORE_KERNELS "leaky_relu.o"
+        OUTPUT_HOST LEAKY_RELU_HOST
+        OUTPUT_XCLBIN LEAKY_RELU_XCLBIN
+        OUTPUT_INSTS LEAKY_RELU_INSTS)
+
+    add_golden_reference_generator(${EXAMPLE}
+        ${CMAKE_CURRENT_SOURCE_DIR}/leaky_relu_golden.py
+        ${GOLDEN_VALUES_PATH}
+        --input_length ${INPUT_LENGTH}
+        --alpha ${LEAKY_RELU_ALPHA}
+    )
+
+    # The test passes when the host prints "PASS!"; latency and bandwidth are
+    # scraped from the host's stdout.
+    add_aie_ci_test(${EXAMPLE}
+                    RUN
+                        "${LEAKY_RELU_HOST} -k ${EXAMPLE} -x ${LEAKY_RELU_XCLBIN} -i ${LEAKY_RELU_INSTS} -l ${INPUT_LENGTH} --ref ${GOLDEN_VALUES_PATH}/golden_reference.bin --alpha ${LEAKY_RELU_ALPHA}"
+                    CHECK
+                        "PASS!"
+                    METRICS
+                        "Latency" [=[Latency \(us\): (?P<metric>\d+)]=]
+                        "Bandwidth" [=[Effective Bandwidth: (?P<metric>[\d\.e\+-]+) GB/s]=]
+    )
+endfunction()
+
+# Sweep configuration: a single length by default, more under EXTENSIVE_TESTING.
+set(INPUT_LENGTHS 2048)
+set(NUM_CHANNELS 1)  # Leaky ReLU has a single input, so one channel is used
+set(TRACE_SIZE 65536)
+if(EXTENSIVE_TESTING)
+    list(APPEND INPUT_LENGTHS 1024 4096 8192)
+endif()
+
+foreach(INPUT_LENGTH IN LISTS INPUT_LENGTHS)
+    foreach(NUM_COLUMNS RANGE 1 ${MAX_COLUMNS})
+        # Split the single input evenly across columns, capping a tile at 4096
+        math(EXPR TILE_SIZE "${INPUT_LENGTH} / ${NUM_COLUMNS}")
+        if(TILE_SIZE GREATER 4096)
+            set(TILE_SIZE 4096)
+        endif()
+        # Skip any combination where TILE_SIZE * NUM_COLUMNS does not reproduce
+        # INPUT_LENGTH. That covers inexact divisions and, as written, also
+        # every combination where the 4096 cap was applied.
+        # NOTE(review): confirm that skipping capped configurations is intended.
+        math(EXPR CHECK_LENGTH "${TILE_SIZE} * ${NUM_COLUMNS}")
+        if(NOT CHECK_LENGTH EQUAL ${INPUT_LENGTH})
+            continue()
+        endif()
+        add_aie_leaky_relu_design(${INPUT_LENGTH} ${NUM_COLUMNS} ${NUM_CHANNELS} ${TILE_SIZE} ${TRACE_SIZE})
+    endforeach()
+endforeach()
+
diff --git a/example/leaky_relu/leaky_relu.cpp b/example/leaky_relu/leaky_relu.cpp
@@ -0,0 +1,189 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "cxxopts.hpp"
+#include "golden_reference_reader.h"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+/**
+ * Host-side test driver for the Leaky ReLU design.
+ *
+ * Parses the command line, loads the instruction stream and the golden
+ * reference, runs the kernel twice through XRT (one untimed run, one timed
+ * run), prints latency and effective bandwidth, and compares the device output
+ * against reference tensor "B".
+ *
+ * @return 0 when every element matches within tolerance ("PASS!" is printed),
+ *         1 on argument errors, a missing kernel, a failed run or mismatches.
+ */
+int main(int argc, const char *argv[])
+{
+    // Program arguments parsing
+    cxxopts::Options options("Leaky ReLU Test");
+    cxxopts::ParseResult vm;
+
+    options.add_options()("help,h",
+                          "produce help message")("xclbin,x", "the input xclbin path", cxxopts::value<std::string>())(
+        "kernel,k", "the kernel name in the XCLBIN (for instance PP_PRE_FD)", cxxopts::value<std::string>())(
+        "verbosity,v", "the verbosity of the output", cxxopts::value<int>()->default_value("0"))(
+        "instr,i",
+        "path of file containing userspace instructions to be sent to the LX6",
+        cxxopts::value<std::string>())(
+        "length,l", "the length of the transfer in std::bfloat16_t", cxxopts::value<int>()->default_value("4096"))(
+        "ref",
+        "path to golden reference file",
+        cxxopts::value<std::string>()->default_value("golden_leaky_relu/golden_reference.bin"))(
+        "alpha,a", "alpha parameter for Leaky ReLU", cxxopts::value<float>()->default_value("0.01"));
+
+    try {
+        vm = options.parse(argc, argv);
+
+        if (vm.count("help")) {
+            std::cout << options.help() << std::endl;
+            return 1;
+        }
+
+        // Check required options
+        if (!vm.count("xclbin") || !vm.count("kernel") || !vm.count("instr") || !vm.count("ref")) {
+            std::cerr << "Error: Required options missing\n\n";
+            std::cerr << "Usage:\n" << options.help() << std::endl;
+            return 1;
+        }
+    } catch (const cxxopts::exceptions::parsing &e) {
+        std::cerr << e.what() << "\n\n";
+        std::cerr << "Usage:\n" << options.help() << std::endl;
+        return 1;
+    }
+
+    std::vector<uint32_t> instr_v = test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+    std::string ref_path = vm["ref"].as<std::string>();
+    GoldenReference ref = GoldenReference::fromFile(ref_path);
+
+    int verbosity = vm["verbosity"].as<int>();
+    if (verbosity >= 1)
+        std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
+
+    int N = vm["length"].as<int>();
+    // Zero and negative multiples of 1024 pass the modulo test below but would
+    // give zero-sized or wrapped-around buffer sizes, so reject them first.
+    if (N <= 0) {
+        std::cerr << "Length must be positive." << std::endl;
+        return 1;
+    }
+    if ((N % 1024)) {
+        std::cerr << "Length must be a multiple of 1024." << std::endl;
+        return 1;
+    }
+
+    float alpha = vm["alpha"].as<float>();
+    std::bfloat16_t alpha_bf16 = static_cast<std::bfloat16_t>(alpha);
+
+    // Start the XRT test code
+    // Get a device handle
+    unsigned int device_index = 0;
+    auto device = xrt::device(device_index);
+
+    // Load the xclbin
+    if (verbosity >= 1)
+        std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << std::endl;
+    auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+    if (verbosity >= 1)
+        std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << std::endl;
+    std::string Node = vm["kernel"].as<std::string>();
+
+    // Get the kernel from the xclbin: first kernel whose name starts with Node
+    auto xkernels = xclbin.get_kernels();
+    auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(), [&Node](xrt::xclbin::kernel &k) {
+        auto name = k.get_name();
+        std::cout << "Name: " << name << std::endl;
+        return name.rfind(Node, 0) == 0;
+    });
+    // Dereferencing end() is undefined behaviour, so fail cleanly when no
+    // kernel matches the requested name.
+    if (xkernel_it == xkernels.end()) {
+        std::cerr << "Error: no kernel starting with '" << Node << "' found in xclbin" << std::endl;
+        return 1;
+    }
+    auto kernelName = xkernel_it->get_name();
+
+    if (verbosity >= 1)
+        std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+
+    device.register_xclbin(xclbin);
+
+    // get a hardware context
+    if (verbosity >= 1)
+        std::cout << "Getting hardware context." << std::endl;
+    xrt::hw_context context(device, xclbin.get_uuid());
+
+    // get a kernel handle
+    if (verbosity >= 1)
+        std::cout << "Getting handle to kernel:" << kernelName << std::endl;
+    auto kernel = xrt::kernel(context, kernelName);
+
+    // Buffer objects: instruction stream, input, output and the single alpha value
+    auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+    auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+    auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+    auto bo_alpha = xrt::bo(device, sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+    if (verbosity >= 1)
+        std::cout << "Writing data into buffer objects." << std::endl;
+
+    // NOTE(review): assumes reference tensor "A" holds at least N elements -
+    // confirm against the golden generator; fewer would over-read here.
+    std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
+    memcpy(bufInA, ref.get<std::bfloat16_t>("A")->data(), N * sizeof(std::bfloat16_t));
+
+    std::bfloat16_t *bufAlpha = bo_alpha.map<std::bfloat16_t *>();
+    *bufAlpha = alpha_bf16;
+
+    void *bufInstr = bo_instr.map<void *>();
+    memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+    bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+    bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+    bo_alpha.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+    if (verbosity >= 1)
+        std::cout << "Running Kernel." << std::endl;
+    unsigned int opcode = 3;
+    // Setup run to configure (not timed)
+    auto cfg_run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_alpha);
+    cfg_run.wait();
+    auto start = std::chrono::high_resolution_clock::now();
+    // Test run (timed)
+    auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_alpha);
+    ert_cmd_state r = run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    if (r != ERT_CMD_STATE_COMPLETED) {
+        std::cout << "Kernel did not complete. Returned status: " << r << std::endl;
+        return 1;
+    }
+    const float npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    std::cout << std::endl;
+    std::cout << "Latency (us): " << npu_time << std::endl;
+    std::cout << std::endl;
+
+    double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
+    double bandwidth_GBps = (total_bytes / (1024 * 1024 * 1024)) / (npu_time * 1e-6);
+    std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl;
+
+    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+
+    int errors = 0;
+    auto ref_B = ref.get<std::bfloat16_t>("B");
+
+    // Element-wise comparison of the device output against reference tensor "B"
+    for (int i = 0; i < N; i++) {
+        std::bfloat16_t ref_val = (*ref_B)[i];
+        // if (i < 10){
+        //   std::cout << "Index " << i << ": Computed=" << *(bufOut + i) << ", Reference=" << ref_val << std::endl;
+        // }
+        if (!test_utils::nearly_equal(*(bufOut + i), ref_val, 0.01, 1e-6)) {
+            errors++;
+            // Print the first 100 mismatches
+            if (errors <= 100) {
+                std::cout << "Mismatch at index " << i << ": " << "Expected: " << ref_val << ", "
+                          << "Got: " << *(bufOut + i) << std::endl;
+            }
+        }
+    }
+
+    if (!errors) {
+        std::cout << std::endl << "PASS!" << std::endl << std::endl;
+        return 0;
+    } else {
+        std::cout << std::endl << errors << " mismatches." << std::endl << std::endl;
+        std::cout << std::endl << "fail." << std::endl << std::endl;
+        return 1;
+    }
+}
diff --git a/example/leaky_relu/leaky_relu.py b/example/leaky_relu/leaky_relu.py
diff --git a/example/leaky_relu/leaky_relu_golden.py b/example/leaky_relu/leaky_relu_golden.py