Skip to content

Commit 63caafa

Browse files
authored
Add Leaky RELU kernel implementation (#2)
1 parent 695509f commit 63caafa

File tree

8 files changed

+590
-1
lines changed

8 files changed

+590
-1
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper:
5252
| [Reduction]() | Reduction | bfloat16 | 🟡 | |
5353
| [Dequant](./aie_kernels/aie2/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 | 🟢 | [example/dequant/](./example/dequant/) |
5454
| [RELU](./aie_kernels/aie2p/relu.cc) | RELU | bfloat16 | 🟢 | [example/relu/](./example/relu/) |
55-
| [Leaky RELU]() | Leaky RELU | bfloat16 | | |
55+
| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) | Leaky RELU kernel | bfloat16 | 🟢 | [example/leaky_relu/](./example/leaky_relu/) |
5656
| [GELU](./aie_kernels/aie2p/gelu.cc) | GELU | bfloat16 | 🟢 | [example/gelu/](./example/gelu/) |
5757
| [LayerNorm](./aie_kernels/aie2p/layer_norm.cc) | LayerNorm | bfloat16 | 🟢 | [example/layer_norm/](./example/layer_norm/) |
5858
| [Convolution]() | Convolution | bfloat16 | 🟡 | |

aie_kernels/aie2p/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
add_aie_kernel(gelu.o SOURCES gelu.cc)
55
add_aie_kernel(layer_norm.o SOURCES layer_norm.cc)
6+
add_aie_kernel(leaky_relu.o SOURCES leaky_relu.cc)
67
add_aie_kernel(relu.o SOURCES relu.cc)
78
add_aie_kernel(rms_norm.o SOURCES rms_norm.cc)
89
add_aie_kernel(silu.o SOURCES silu.cc)

aie_kernels/aie2p/leaky_relu.cc

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include "../aie_kernel_utils.h"
5+
6+
#include <aie_api/aie.hpp>
7+
#include <stdint.h>
8+
9+
using namespace aie;
10+
11+
/**
 * Element-wise Leaky ReLU on a bfloat16 buffer, processed 32 lanes at a time.
 *
 * Each output element is computed as max(x, alpha * x). This equals the usual
 * Leaky ReLU definition (x for x > 0, alpha * x otherwise) only when
 * alpha <= 1; for alpha > 1 the max selects the opposite branch.
 *
 * @param a            Input buffer; read as consecutive 32-element vectors.
 * @param c            Output buffer; written as consecutive 32-element vectors.
 * @param vector_size  Number of elements to process. The loop advances by 32
 *                     and every iteration loads and stores a full 32-lane
 *                     vector, so this should be a multiple of 32.
 * @param alpha        Slope applied to the input before taking the max.
 *
 * NOTE(review): AIE_LOOP_MIN_ITERATION_COUNT(32) together with the stride of
 * 32 appears to promise the compiler vector_size >= 1024 -- confirm that every
 * caller's tile size satisfies this.
 */
void leaky_relu_vectorized_bf16(bfloat16 *restrict a,
                                bfloat16 *restrict c,
                                const int32_t vector_size,
                                const bfloat16 alpha)
{
    event0();

    auto it_in = aie::begin_restrict_vector<32>(a);
    auto it_out = aie::begin_restrict_vector<32>(c);

    // Broadcast alpha once so the multiply inside the loop is vector * vector.
    const aie::vector<bfloat16, 32> alpha_vec = aie::broadcast<bfloat16, 32>(alpha);

    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(32)
    for (int i = 0; i < vector_size; i += 32) {
        const aie::vector<bfloat16, 32> input = *it_in++;
        // Leaky ReLU as max(x, alpha * x); see the alpha <= 1 caveat above.
        const aie::vector<bfloat16, 32> alpha_times_input = aie::mul(input, alpha_vec);
        *it_out++ = aie::max(input, alpha_times_input);
    }

    event1();
}
40+
41+
extern "C" {
42+
43+
void leaky_relu_bf16(bfloat16 *restrict input, bfloat16 *restrict output, int input_size, bfloat16 alpha)
44+
{
45+
leaky_relu_vectorized_bf16(input, output, input_size, alpha);
46+
}
47+
48+
} // extern "C"

example/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ add_subdirectory(elementwise_add)
88
add_subdirectory(gelu)
99
add_subdirectory(gemm)
1010
add_subdirectory(layer_norm)
11+
add_subdirectory(leaky_relu)
1112
add_subdirectory(matrix_vector_mul)
1213
add_subdirectory(mem_copy)
1314
add_subdirectory(relu)

example/leaky_relu/CMakeLists.txt

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Registers one Leaky ReLU design variant: the AIE executable (host program plus
# generated design), its golden-reference generator, and the CI test that runs
# the host binary against that reference.
#
#   INPUT_LENGTH  value passed as -l / --input_length
#   NUM_COLUMNS   value passed to the design generator as --co
#   NUM_CHANNELS  value passed to the design generator as --chan
#   TILE_SIZE     value passed to the design generator as --tile-size
#   TRACE_SIZE    value passed to the design generator as --trace-size
function(add_aie_leaky_relu_design INPUT_LENGTH NUM_COLUMNS NUM_CHANNELS TILE_SIZE TRACE_SIZE)

    set(EXAMPLE "leaky_relu_${NUM_COLUMNS}_cols_${NUM_CHANNELS}_channels_${INPUT_LENGTH}_tile_${TILE_SIZE}")
    set(GOLDEN_VALUES_PATH "${CMAKE_BINARY_DIR}/example/leaky_relu/golden_${EXAMPLE}")
    set(INPUT_DATA_TYPE_CPP bfloat16_t)
    set(OUTPUT_DATA_TYPE_CPP bfloat16_t)

    # Single source of truth for the slope: the design generator, the golden
    # generator and the host run must all use the same alpha for the check to pass.
    set(LEAKY_RELU_ALPHA 0.01)

    add_aie_executable(${EXAMPLE}
        HOST leaky_relu.cpp
        HOST_FLAGS DTYPE_IN=${INPUT_DATA_TYPE_CPP} DTYPE_OUT=${OUTPUT_DATA_TYPE_CPP}
        PYTHON leaky_relu.py
        PYTHON_FLAGS --dev ${DEVICE} -l ${INPUT_LENGTH} --co ${NUM_COLUMNS} --chan ${NUM_CHANNELS} --tile-size ${TILE_SIZE} --trace-size ${TRACE_SIZE} --alpha ${LEAKY_RELU_ALPHA} --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir
        AIE_CORE_KERNELS "leaky_relu.o"
        OUTPUT_HOST LEAKY_RELU_HOST
        OUTPUT_XCLBIN LEAKY_RELU_XCLBIN
        OUTPUT_INSTS LEAKY_RELU_INSTS)

    add_golden_reference_generator(${EXAMPLE}
        ${CMAKE_CURRENT_SOURCE_DIR}/leaky_relu_golden.py
        ${GOLDEN_VALUES_PATH}
        --input_length ${INPUT_LENGTH}
        --alpha ${LEAKY_RELU_ALPHA}
    )

    # The test passes when the host prints "PASS!"; latency and bandwidth are
    # scraped from its output with the regexes below.
    add_aie_ci_test(${EXAMPLE}
        RUN
            "${LEAKY_RELU_HOST} -k ${EXAMPLE} -x ${LEAKY_RELU_XCLBIN} -i ${LEAKY_RELU_INSTS} -l ${INPUT_LENGTH} --ref ${GOLDEN_VALUES_PATH}/golden_reference.bin --alpha ${LEAKY_RELU_ALPHA}"
        CHECK
            "PASS!"
        METRICS
            "Latency" [=[Latency \(us\): (?P<metric>\d+)]=]
            "Bandwidth" [=[Effective Bandwidth: (?P<metric>[\d\.e\+-]+) GB/s]=]
    )
endfunction()
38+
39+
# Sweep configuration: one default input length, more under EXTENSIVE_TESTING.
set(INPUT_LENGTHS 2048)
set(NUM_CHANNELS 1) # the design has a single input, so one channel is used
set(TRACE_SIZE 65536)
if(EXTENSIVE_TESTING)
    list(APPEND INPUT_LENGTHS 1024 4096 8192)
endif()

# Register one design per (input length, column count) pair that tiles exactly.
foreach(INPUT_LENGTH IN LISTS INPUT_LENGTHS)
    foreach(NUM_COLUMNS RANGE 1 ${MAX_COLUMNS})
        # Even split of the single input across the columns (integer division).
        math(EXPR TILE_SIZE "${INPUT_LENGTH} / ${NUM_COLUMNS}")
        # Tiles are limited to 4096 elements. A clamped tile no longer covers
        # the whole input, so such a combination is rejected by the check below.
        if(TILE_SIZE GREATER 4096)
            set(TILE_SIZE 4096)
        endif()
        # Keep only combinations where the tiles cover the input exactly.
        math(EXPR COVERED_LENGTH "${TILE_SIZE} * ${NUM_COLUMNS}")
        if(COVERED_LENGTH EQUAL ${INPUT_LENGTH})
            add_aie_leaky_relu_design(${INPUT_LENGTH} ${NUM_COLUMNS} ${NUM_CHANNELS} ${TILE_SIZE} ${TRACE_SIZE})
        endif()
    endforeach()
endforeach()
63+

example/leaky_relu/leaky_relu.cpp

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include "cxxopts.hpp"
#include "golden_reference_reader.h"
#include "test_utils.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
18+
19+
int main(int argc, const char *argv[])
20+
{
21+
// Program arguments parsing
22+
cxxopts::Options options("Leaky ReLU Test");
23+
cxxopts::ParseResult vm;
24+
25+
options.add_options()("help,h",
26+
"produce help message")("xclbin,x", "the input xclbin path", cxxopts::value<std::string>())(
27+
"kernel,k", "the kernel name in the XCLBIN (for instance PP_PRE_FD)", cxxopts::value<std::string>())(
28+
"verbosity,v", "the verbosity of the output", cxxopts::value<int>()->default_value("0"))(
29+
"instr,i",
30+
"path of file containing userspace instructions to be sent to the LX6",
31+
cxxopts::value<std::string>())(
32+
"length,l", "the length of the transfer in std::bfloat16_t", cxxopts::value<int>()->default_value("4096"))(
33+
"ref",
34+
"path to golden reference file",
35+
cxxopts::value<std::string>()->default_value("golden_leaky_relu/golden_reference.bin"))(
36+
"alpha,a", "alpha parameter for Leaky ReLU", cxxopts::value<float>()->default_value("0.01"));
37+
38+
try {
39+
vm = options.parse(argc, argv);
40+
41+
if (vm.count("help")) {
42+
std::cout << options.help() << std::endl;
43+
return 1;
44+
}
45+
46+
// Check required options
47+
if (!vm.count("xclbin") || !vm.count("kernel") || !vm.count("instr") || !vm.count("ref")) {
48+
std::cerr << "Error: Required options missing\n\n";
49+
std::cerr << "Usage:\n" << options.help() << std::endl;
50+
return 1;
51+
}
52+
} catch (const cxxopts::exceptions::parsing &e) {
53+
std::cerr << e.what() << "\n\n";
54+
std::cerr << "Usage:\n" << options.help() << std::endl;
55+
return 1;
56+
}
57+
58+
std::vector<uint32_t> instr_v = test_utils::load_instr_binary(vm["instr"].as<std::string>());
59+
60+
std::string ref_path = vm["ref"].as<std::string>();
61+
GoldenReference ref = GoldenReference::fromFile(ref_path);
62+
63+
int verbosity = vm["verbosity"].as<int>();
64+
if (verbosity >= 1)
65+
std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
66+
67+
int N = vm["length"].as<int>();
68+
if ((N % 1024)) {
69+
std::cerr << "Length must be a multiple of 1024." << std::endl;
70+
return 1;
71+
}
72+
73+
float alpha = vm["alpha"].as<float>();
74+
std::bfloat16_t alpha_bf16 = static_cast<std::bfloat16_t>(alpha);
75+
76+
// Start the XRT test code
77+
// Get a device handle
78+
unsigned int device_index = 0;
79+
auto device = xrt::device(device_index);
80+
81+
// Load the xclbin
82+
if (verbosity >= 1)
83+
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << std::endl;
84+
auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
85+
86+
if (verbosity >= 1)
87+
std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << std::endl;
88+
std::string Node = vm["kernel"].as<std::string>();
89+
90+
// Get the kernel from the xclbin
91+
auto xkernels = xclbin.get_kernels();
92+
auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), [Node](xrt::xclbin::kernel &k) {
93+
auto name = k.get_name();
94+
std::cout << "Name: " << name << std::endl;
95+
return name.rfind(Node, 0) == 0;
96+
});
97+
auto kernelName = xkernel.get_name();
98+
99+
if (verbosity >= 1)
100+
std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>() << "\n";
101+
102+
device.register_xclbin(xclbin);
103+
104+
// get a hardware context
105+
if (verbosity >= 1)
106+
std::cout << "Getting hardware context." << std::endl;
107+
xrt::hw_context context(device, xclbin.get_uuid());
108+
109+
// get a kernel handle
110+
if (verbosity >= 1)
111+
std::cout << "Getting handle to kernel:" << kernelName << std::endl;
112+
auto kernel = xrt::kernel(context, kernelName);
113+
114+
auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
115+
auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
116+
auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
117+
auto bo_alpha = xrt::bo(device, sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
118+
119+
if (verbosity >= 1)
120+
std::cout << "Writing data into buffer objects." << std::endl;
121+
122+
std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
123+
memcpy(bufInA, ref.get<std::bfloat16_t>("A")->data(), N * sizeof(std::bfloat16_t));
124+
125+
std::bfloat16_t *bufAlpha = bo_alpha.map<std::bfloat16_t *>();
126+
*bufAlpha = alpha_bf16;
127+
128+
void *bufInstr = bo_instr.map<void *>();
129+
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
130+
131+
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
132+
bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
133+
bo_alpha.sync(XCL_BO_SYNC_BO_TO_DEVICE);
134+
135+
if (verbosity >= 1)
136+
std::cout << "Running Kernel." << std::endl;
137+
unsigned int opcode = 3;
138+
// Setup run to configure
139+
auto cfg_run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_alpha);
140+
cfg_run.wait();
141+
auto start = std::chrono::high_resolution_clock::now();
142+
// Test run
143+
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_alpha);
144+
ert_cmd_state r = run.wait();
145+
auto stop = std::chrono::high_resolution_clock::now();
146+
if (r != ERT_CMD_STATE_COMPLETED) {
147+
std::cout << "Kernel did not complete. Returned status: " << r << std::endl;
148+
return 1;
149+
}
150+
const float npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
151+
152+
bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
153+
std::cout << std::endl;
154+
std::cout << "Latency (us): " << npu_time << std::endl;
155+
std::cout << std::endl;
156+
157+
double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
158+
double bandwidth_GBps = (total_bytes / (1024 * 1024 * 1024)) / (npu_time * 1e-6);
159+
std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl;
160+
161+
std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
162+
163+
int errors = 0;
164+
auto ref_B = ref.get<std::bfloat16_t>("B");
165+
166+
for (int i = 0; i < N; i++) {
167+
std::bfloat16_t ref_val = (*ref_B)[i];
168+
// if (i < 10){
169+
// std::cout << "Index " << i << ": Computed=" << *(bufOut + i) << ", Reference=" << ref_val << std::endl;
170+
// }
171+
if (!test_utils::nearly_equal(*(bufOut + i), ref_val, 0.01, 1e-6)) {
172+
errors++;
173+
// Print the first 100 mismatches
174+
if (errors <= 100) {
175+
std::cout << "Mismatch at index " << i << ": " << "Expected: " << ref_val << ", "
176+
<< "Got: " << *(bufOut + i) << std::endl;
177+
}
178+
}
179+
}
180+
181+
if (!errors) {
182+
std::cout << std::endl << "PASS!" << std::endl << std::endl;
183+
return 0;
184+
} else {
185+
std::cout << std::endl << errors << " mismatches." << std::endl << std::endl;
186+
std::cout << std::endl << "fail." << std::endl << std::endl;
187+
return 1;
188+
}
189+
}

0 commit comments

Comments
 (0)