From c938a171bad63c36774f80de9c822875ad671437 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 24 Jun 2024 10:37:05 -0700 Subject: [PATCH 1/8] [matmul] re-introduce support for i16 datatype --- .../matrix_multiplication/makefile-common | 21 ++++++++- .../matrix_multiplication/single_core/aie2.py | 39 ++++++++++----- .../basic/matrix_multiplication/test.cpp | 29 ++++++++---- .../matrix_multiplication/whole_array/aie2.py | 47 ++++++++++++++----- 4 files changed, 99 insertions(+), 37 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index 9f336f1099..f598fe74fc 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -37,6 +37,22 @@ include ${current_dir}../../makefile-common M?=512 K?=512 N?=512 +dtype_in?=bf16 +dtype_out?=bf16 + +ifeq ($(dtype_in),bf16) + dtype_in_cpp=std::bfloat16_t +endif +ifeq ($(dtype_out),bf16) + dtype_out_cpp=std::bfloat16_t +endif + +ifeq ($(dtype_in),i16) + dtype_in_cpp=int16_t +endif +ifeq ($(dtype_out),i16) + dtype_out_cpp=int16_t +endif trace_size?=65536 @@ -46,7 +62,7 @@ xclbin_target?=build/final_${target_suffix}.xclbin insts_target?=build/insts_${target_suffix}.txt runargs?=-v 2 --warmup 1 --iters 1 -aieargs+=-M $M -K $K -N $N +aieargs+=-M $M -K $K -N $N --dtype_in ${dtype_in} --dtype_out ${dtype_out} kernels_dir=${srcdir}/../../../../aie_kernels/aie2 @@ -69,7 +85,8 @@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o} ${targetname}.exe: ${srcdir}/test.cpp ${srcdir}/../test.cpp ${srcdir}/../common.h rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir}/.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb -DDTYPE_IN=${dtype_in_cpp} -DDTYPE_OUT=${dtype_out_cpp}" \ + cmake ${srcdir}/.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 5eef847850..a4f654e71e 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -26,11 +26,13 @@ def main(): argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) argparser.add_argument("-n", type=int, default=64) + argparser.add_argument("--dtype_in", type=str, choices=["bf16", "i16"], default="bf16") + argparser.add_argument("--dtype_out", type=str, choices=["bf16", "i16"], default="bf16") args = argparser.parse_args() - my_matmul(args.M, args.K, args.N, args.m, args.k, args.n) + my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out) -def my_matmul(M, K, N, m, k, n): +def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): assert M % m == 0 assert K % k == 0 @@ -44,10 +46,21 @@ def my_matmul(M, K, N, m, k, n): assert k % s == 0 assert n % t == 0 - vectorized = True + vectorized = False enable_tracing = False trace_size = 65536 + dtype_in = None + if dtype_in_str == "bf16": + dtype_in = T.bf16 + elif dtype_in_str == "i16": + dtype_in = T.i16 + dtype_out = None + if dtype_out_str == "bf16": + dtype_out = T.bf16 + elif dtype_out_str == "i16": + dtype_out = T.i16 + A_sz = M * K B_sz = K * N C_sz = M * N @@ -68,23 +81,23 @@ def my_matmul(M, K, N, m, k, n): @device(AIEDevice.npu1_1col) def device_body(): - memref_a_ty = T.memref(m, k, T.bf16()) - memref_b_ty = T.memref(k, n, T.bf16()) - memref_c_ty = T.memref(m, n, T.bf16()) + memref_a_ty = T.memref(m, k, dtype_in()) + memref_b_ty = T.memref(k, n, dtype_in()) + memref_c_ty = T.memref(m, n, dtype_out()) ofifo_memref_a_ty = TypeAttr.get(ObjectFifoType.get(memref_a_ty)) ofifo_memref_b_ty = TypeAttr.get(ObjectFifoType.get(memref_b_ty)) ofifo_memref_c_ty = TypeAttr.get(ObjectFifoType.get(memref_c_ty)) # AIE Core Function declarations - zero_scalar = external_func("zero_scalar_bf16", inputs=[memref_c_ty]) - zero = external_func("zero_bf16", inputs=[memref_c_ty]) + zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty]) + zero = external_func(f"zero_{dtype_out_str}", inputs=[memref_c_ty]) matmul_scalar = external_func( - "matmul_scalar_bf16_bf16", + f"matmul_scalar_{dtype_in_str}_{dtype_out_str}", inputs=[memref_a_ty, memref_b_ty, memref_c_ty], ) matmul = external_func( - "matmul_bf16_bf16", inputs=[memref_a_ty, memref_b_ty, memref_c_ty] + f"matmul_{dtype_in_str}_{dtype_out_str}", inputs=[memref_a_ty, memref_b_ty, memref_c_ty] ) # Tile declarations @@ -196,9 +209,9 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz, T.bf16()), - T.memref(B_sz, T.bf16()), - T.memref(C_sz, T.bf16()), + T.memref(A_sz, dtype_in()), + T.memref(B_sz, dtype_in()), + T.memref(C_sz, dtype_out()), ) def sequence(A, B, C): diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index c838f30aeb..71d0405b18 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -28,12 +28,21 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using A_DATATYPE = std::bfloat16_t; -using B_DATATYPE = std::bfloat16_t; -using C_DATATYPE 
= std::bfloat16_t; +#ifndef DTYPE_IN +#define DTYPE_IN std::bfloat16_t +#endif +#ifndef DTYPE_OUT +#define DTYPE_OUT std::bfloat16_t +#endif +using A_DATATYPE = DTYPE_IN; +using B_DATATYPE = DTYPE_IN; +using C_DATATYPE = DTYPE_OUT; using ACC_DATATYPE = float; #endif +#define XSTR(X) STR(X) +#define STR(X) #X + constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024; constexpr int verify_stochastic_n_samples = 1000; @@ -140,7 +149,7 @@ int main(int argc, const char *argv[]) { std::vector AVec(A_VOLUME); for (int i = 0; i < A_VOLUME; i++) { AVec[i] = matmul_common::random_bfloat16_t(); - // AVec[i] = i; + //AVec[i] = i; } memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); B_DATATYPE *bufB = bo_b.map(); @@ -148,11 +157,11 @@ int main(int argc, const char *argv[]) { for (int i = 0; i < B_VOLUME; i++) { BVec[i] = matmul_common::random_bfloat16_t(); // Diagonal: - // if(i % N == i / N) { - // BVec[i] = 1.0; - // } else { - // BVec[i] = 0.0; - // } + //if(i % N == i / N) { + // BVec[i] = 1.0; + //} else { + // BVec[i] = 0.0; + //} } memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); @@ -162,6 +171,8 @@ int main(int argc, const char *argv[]) { memset(bufOut, 0, OUT_SIZE); if (verbosity >= 2) { + std::cout << "DTYPE_IN = " XSTR(DTYPE_IN) "\n"; + std::cout << "DTYPE_OUT = " XSTR(DTYPE_OUT) "\n"; std::cout << "A = \n"; matmul_common::print_matrix(AVec, K); std::cout << "B = \n"; diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 0bd8d119fb..fb58b0b05c 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -27,9 +27,11 @@ def main(): argparser.add_argument("-k", type=int, default=64) argparser.add_argument("-n", type=int, default=64) argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4], default=4) + argparser.add_argument("--dtype_in", type=str, choices=["bf16", "i16"], default="bf16") + argparser.add_argument("--dtype_out", type=str, choices=["bf16", "i16"], default="bf16") args = argparser.parse_args() with mlir_mod_ctx() as ctx: - my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.n_aie_cols) + my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.n_aie_cols, args.dtype_in, args.dtype_out) # print(ctx.module.operation.verify()) print(ctx.module) @@ -38,7 +40,7 @@ def ceildiv(a, b): return (a + b - 1) // b -def my_matmul(M, K, N, m, k, n, n_aie_cols): +def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): r = 4 s = 8 t = 4 @@ -46,6 +48,18 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols): n_aie_rows = 4 n_aie_cores = n_aie_rows * n_aie_cols + dtype_in = None + if dtype_in_str == "bf16": + dtype_in = T.bf16 + elif dtype_in_str == "i16": + dtype_in = T.i16 + dtype_out = None + if dtype_out_str == "bf16": + dtype_out = T.bf16 + elif dtype_out_str == "i16": + dtype_out = T.i16 + + # Input matrix A: # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. 
These # blocks are _broadcast_ across AIE core columns, then _distributed_ across @@ -90,24 +104,31 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols): @device(dev) def device_body(): - A_l2_memref_ty = T.memref(m * k * n_A_tiles_per_shim, T.bf16()) - B_l2_memref_ty = T.memref(k * n, T.bf16()) - C_l2_memref_ty = T.memref(m * n * n_aie_rows, T.bf16()) - A_l1_memref_ty = T.memref(m, k, T.bf16()) - B_l1_memref_ty = T.memref(k, n, T.bf16()) - C_l1_memref_ty = T.memref(m, n, T.bf16()) + A_l2_memref_ty = T.memref(m * k * n_A_tiles_per_shim, dtype_in()) + B_l2_memref_ty = T.memref(k * n, dtype_in()) + C_l2_memref_ty = T.memref(m * n * n_aie_rows, dtype_out()) + A_l1_memref_ty = T.memref(m, k, dtype_in()) + B_l1_memref_ty = T.memref(k, n, dtype_in()) + C_l1_memref_ty = T.memref(m, n, dtype_out()) # AIE Core Function declarations zero_scalar = external_func("zero_scalar_bf16", inputs=[C_l1_memref_ty]) - zero = external_func("zero_bf16", inputs=[C_l1_memref_ty]) + zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[C_l1_memref_ty]) + zero = external_func(f"zero_{dtype_out_str}", inputs=[C_l1_memref_ty]) matmul_scalar = external_func( "matmul_scalar_bf16_bf16", inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], ) + matmul_scalar = external_func( + f"matmul_scalar_{dtype_in_str}_{dtype_out_str}", + inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], + ) matmul = external_func( - "matmul_bf16_bf16", inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty] + f"matmul_{dtype_in_str}_{dtype_out_str}", + inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty] ) + # Tile declarations as tile[row][col] tiles = [ [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6) @@ -250,9 +271,9 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(M * K, T.bf16()), - T.memref(K * N, T.bf16()), - T.memref(M * N, T.bf16()), + T.memref(M * K, dtype_in()), + T.memref(K * N, dtype_in()), + T.memref(M * N, dtype_out()), ) def sequence(A, B, C): # We are limited in the number of BDs. After synchronizing, we can reuse BDs. 
From 23258b7650ca8bca1dde59bf4599af2ec372cc01 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 24 Jun 2024 10:52:40 -0700 Subject: [PATCH 2/8] [matmul] allow changeable acc datatype --- .../basic/matrix_multiplication/common.h | 17 +++++++++-------- .../basic/matrix_multiplication/makefile-common | 4 +++- .../matrix_multiplication/single_core/aie2.py | 2 +- .../basic/matrix_multiplication/test.cpp | 13 ++++++++++--- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index b2c6c14b53..892aba5d6a 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -237,10 +237,8 @@ struct error { template std::optional> -verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) { - const float absTol = 0.5; - const float relTol = 0.05; - if (!nearly_equal(expected, actual, relTol, absTol)) { +verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, float abs_tol, float rel_tol) { + if (!nearly_equal(expected, actual, rel_tol, abs_tol)) { return (struct error){row, col, expected, actual}; } return std::nullopt; @@ -275,7 +273,8 @@ void print_progress_bar(std::ostream &os, double progress, int len = 75) { template int verify(int M, int N, int K, std::vector A, std::vector B, - std::vector C, int verbosity = 0) { + std::vector C, int verbosity = 0, + float abs_tol = 0.5, float rel_tol = 0.05) { int n_errors = 0; std::vector> errors; Tout max_rel_error = (Tout)0.0f; @@ -286,7 +285,8 @@ int verify(int M, int N, int K, std::vector A, std::vector B, for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { std::optional> error = verify_single( - std::cout, row, col, CRef[row * N + col], C[row * N + col]); + std::cout, row, col, CRef[row * N + col], C[row * N + col], + abs_tol, rel_tol); if (error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); @@ -316,7 +316,8 @@ int verify(int M, int N, int K, std::vector A, std::vector B, template int verify_stochastic(int M, int N, int K, std::vector A, std::vector B, std::vector C, int n_samples, - int verbosity = 0) { + int verbosity = 0, + float abs_tol=0.5, float rel_tol=0.05) { std::mt19937 rng; auto rows = std::views::iota(0, M); auto cols = std::views::iota(0, N); @@ -343,7 +344,7 @@ int verify_stochastic(int M, int N, int K, std::vector A, } Tout ref = mul_acc(M, N, K, row, col, A, B); std::optional> error = - verify_single(std::cout, row, col, ref, C[row * N + col]); + verify_single(std::cout, row, col, ref, C[row * N + col], abs_tol, rel_tol); if (error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index f598fe74fc..a0fcf77986 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -45,6 +45,7 @@ ifeq ($(dtype_in),bf16) endif ifeq ($(dtype_out),bf16) dtype_out_cpp=std::bfloat16_t + dtype_acc_cpp=float endif ifeq ($(dtype_in),i16) @@ -52,6 +53,7 @@ ifeq ($(dtype_in),i16) endif ifeq ($(dtype_out),i16) dtype_out_cpp=int16_t + dtype_acc_cpp=int16_t endif trace_size?=65536 @@ -85,7 +87,7 @@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o} ${targetname}.exe: ${srcdir}/test.cpp ${srcdir}/../test.cpp 
${srcdir}/../common.h rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb -DDTYPE_IN=${dtype_in_cpp} -DDTYPE_OUT=${dtype_out_cpp}" \ + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb -DDTYPE_IN=${dtype_in_cpp} -DDTYPE_OUT=${dtype_out_cpp} -DDTYPE_ACC=${dtype_acc_cpp}" \ cmake ${srcdir}/.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index a4f654e71e..1835e5d47b 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -46,7 +46,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): assert k % s == 0 assert n % t == 0 - vectorized = False + vectorized = True enable_tracing = False trace_size = 65536 diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index 71d0405b18..645bef5f7b 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -34,10 +34,13 @@ #ifndef DTYPE_OUT #define DTYPE_OUT std::bfloat16_t #endif +#ifndef DTYPE_ACC +#define DTYPE_ACC float +#endif using A_DATATYPE = DTYPE_IN; using B_DATATYPE = DTYPE_IN; using C_DATATYPE = DTYPE_OUT; -using ACC_DATATYPE = float; +using ACC_DATATYPE = DTYPE_ACC; #endif #define XSTR(X) STR(X) @@ -46,6 +49,9 @@ using ACC_DATATYPE = float; constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024; constexpr int verify_stochastic_n_samples = 1000; +constexpr float abs_tol = 0.5; +constexpr float rel_tol = 0.05; + namespace po = boost::program_options; int main(int argc, const char *argv[]) { @@ -173,6 +179,7 @@ int main(int argc, const char *argv[]) { if (verbosity >= 2) { std::cout << "DTYPE_IN = " XSTR(DTYPE_IN) "\n"; std::cout << "DTYPE_OUT = " XSTR(DTYPE_OUT) "\n"; + std::cout << "Verification tolerance " << abs_tol << " absolute, " << rel_tol << " relative.\n"; std::cout << "A = \n"; matmul_common::print_matrix(AVec, K); std::cout << "B = \n"; @@ -232,10 +239,10 @@ int main(int argc, const char *argv[]) { if (do_verify_stochastic) { errors = matmul_common::verify_stochastic( - M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity); + M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, abs_tol, rel_tol); } else { errors = matmul_common::verify( - M, N, K, AVec, BVec, CVec); + M, N, K, AVec, BVec, CVec, abs_tol, rel_tol); } auto vstop = std::chrono::system_clock::now(); float vtime = From 73946a6f50f8b1b114c7ea8b782fe1b6c1ae450e Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 24 Jun 2024 14:39:29 -0600 Subject: [PATCH 3/8] [matmul] for int16, fix microkernel size; for single_core with large M, fix offsets for reused BDs into C --- .../matrix_multiplication/single_core/aie2.py | 21 ++++++++++++------- .../matrix_multiplication/whole_array/aie2.py | 12 ++++++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 1835e5d47b..787e3f26b9 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py 
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -32,15 +32,24 @@ def main(): my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out) +def ceildiv(a, b): + return (a + b - 1) // b + + def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): assert M % m == 0 assert K % k == 0 assert N % n == 0 - r = 4 - s = 8 - t = 4 + if dtype_in_str == "bf16": + r = 4 + s = 8 + t = 4 + elif dtype_in_str == "i16": + r = 4 + s = 4 + t = 4 assert m % r == 0 assert k % s == 0 @@ -226,10 +235,8 @@ def sequence(A, B, C): # only do 5 tile rows at a time before synchronizing, so we can reuse BDs rows_per_block = 5 - for tile_row_block in range( - (M_div_m + rows_per_block - 1) // rows_per_block - ): - C_row_offset = tile_row_block * rows_per_block * m * N + for tile_row_block in range(ceildiv(M_div_m, rows_per_block)): + C_row_offset = tile_row_block * rows_per_block * m * N * 2 num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index fb58b0b05c..42391c4d3a 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -41,9 +41,6 @@ def ceildiv(a, b): def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): - r = 4 - s = 8 - t = 4 n_aie_rows = 4 n_aie_cores = n_aie_rows * n_aie_cols @@ -59,6 +56,15 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): elif dtype_out_str == "i16": dtype_out = T.i16 + if dtype_in_str == "bf16": + r = 4 + s = 8 + t = 4 + elif dtype_in_str == "i16": + r = 4 + s = 4 + t = 4 + # Input matrix A: # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. 
These From 07745f54cc0a5e93e3fe6f713afbe1100f56ffab Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 27 Jun 2024 11:00:37 -0600 Subject: [PATCH 4/8] add support for f32 output dtype --- .../basic/matrix_multiplication/makefile-common | 4 ++++ .../matrix_multiplication/single_core/aie2.py | 15 +++++++++++---- .../matrix_multiplication/whole_array/aie2.py | 10 ++++++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index a0fcf77986..bef5598229 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -55,6 +55,10 @@ ifeq ($(dtype_out),i16) dtype_out_cpp=int16_t dtype_acc_cpp=int16_t endif +ifeq ($(dtype_out),f32) + dtype_out_cpp=float + dtype_acc_cpp=float +endif trace_size?=65536 diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 787e3f26b9..afb5b3738e 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -26,8 +26,12 @@ def main(): argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) argparser.add_argument("-n", type=int, default=64) - argparser.add_argument("--dtype_in", type=str, choices=["bf16", "i16"], default="bf16") - argparser.add_argument("--dtype_out", type=str, choices=["bf16", "i16"], default="bf16") + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i16"], default="bf16" + ) + argparser.add_argument( + "--dtype_out", type=str, choices=["bf16", "i16", "f32"], default="bf16" + ) args = argparser.parse_args() my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out) @@ -69,11 +73,12 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): dtype_out = T.bf16 elif dtype_out_str == "i16": dtype_out = T.i16 + elif dtype_out_str == "f32": + dtype_out = T.f32 A_sz = M * K B_sz = K * N C_sz = M * N - C_sz_in_bytes = C_sz * 2 M_div_m = M // m K_div_k = K // k @@ -88,6 +93,8 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): with mlir_mod_ctx() as ctx: + C_sz_in_bytes = C_sz * dtype_out().width // 8 + @device(AIEDevice.npu1_1col) def device_body(): memref_a_ty = T.memref(m, k, dtype_in()) @@ -236,7 +243,7 @@ def sequence(A, B, C): # only do 5 tile rows at a time before synchronizing, so we can reuse BDs rows_per_block = 5 for tile_row_block in range(ceildiv(M_div_m, rows_per_block)): - C_row_offset = tile_row_block * rows_per_block * m * N * 2 + C_row_offset = tile_row_block * rows_per_block * m * N num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 42391c4d3a..062b7ed212 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -27,8 +27,12 @@ def main(): argparser.add_argument("-k", type=int, default=64) argparser.add_argument("-n", type=int, default=64) argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4], default=4) - argparser.add_argument("--dtype_in", type=str, choices=["bf16", "i16"], default="bf16") - 
argparser.add_argument("--dtype_out", type=str, choices=["bf16", "i16"], default="bf16") + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i16"], default="bf16" + ) + argparser.add_argument( + "--dtype_out", type=str, choices=["bf16", "i16", "f32"], default="bf16" + ) args = argparser.parse_args() with mlir_mod_ctx() as ctx: my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.n_aie_cols, args.dtype_in, args.dtype_out) @@ -55,6 +59,8 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): dtype_out = T.bf16 elif dtype_out_str == "i16": dtype_out = T.i16 + elif dtype_out_str == "f32": + dtype_out = T.f32 if dtype_in_str == "bf16": r = 4 From 1fa00df82c7e1ca43b5e292691a18f25fe0c0f66 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 8 Jul 2024 10:42:15 -0600 Subject: [PATCH 5/8] format --- .../matrix_multiplication/single_core/aie2.py | 11 +++++++--- .../matrix_multiplication/whole_array/aie2.py | 20 ++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index afb5b3738e..96f7cb5b0d 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -33,7 +33,9 @@ def main(): "--dtype_out", type=str, choices=["bf16", "i16", "f32"], default="bf16" ) args = argparser.parse_args() - my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out) + my_matmul( + args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out + ) def ceildiv(a, b): @@ -106,14 +108,17 @@ def device_body(): ofifo_memref_c_ty = TypeAttr.get(ObjectFifoType.get(memref_c_ty)) # AIE Core Function declarations - zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty]) + zero_scalar = external_func( + f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty] + ) zero = external_func(f"zero_{dtype_out_str}", inputs=[memref_c_ty]) matmul_scalar = external_func( f"matmul_scalar_{dtype_in_str}_{dtype_out_str}", inputs=[memref_a_ty, memref_b_ty, memref_c_ty], ) matmul = external_func( - f"matmul_{dtype_in_str}_{dtype_out_str}", inputs=[memref_a_ty, memref_b_ty, memref_c_ty] + f"matmul_{dtype_in_str}_{dtype_out_str}", + inputs=[memref_a_ty, memref_b_ty, memref_c_ty], ) # Tile declarations diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 062b7ed212..16502efd14 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -35,7 +35,17 @@ def main(): ) args = argparser.parse_args() with mlir_mod_ctx() as ctx: - my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.n_aie_cols, args.dtype_in, args.dtype_out) + my_matmul( + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.n_aie_cols, + args.dtype_in, + args.dtype_out, + ) # print(ctx.module.operation.verify()) print(ctx.module) @@ -71,7 +81,6 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): s = 4 t = 4 - # Input matrix A: # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. 
These # blocks are _broadcast_ across AIE core columns, then _distributed_ across @@ -125,7 +134,9 @@ def device_body(): # AIE Core Function declarations zero_scalar = external_func("zero_scalar_bf16", inputs=[C_l1_memref_ty]) - zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[C_l1_memref_ty]) + zero_scalar = external_func( + f"zero_scalar_{dtype_out_str}", inputs=[C_l1_memref_ty] + ) zero = external_func(f"zero_{dtype_out_str}", inputs=[C_l1_memref_ty]) matmul_scalar = external_func( "matmul_scalar_bf16_bf16", @@ -137,10 +148,9 @@ def device_body(): ) matmul = external_func( f"matmul_{dtype_in_str}_{dtype_out_str}", - inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty] + inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], ) - # Tile declarations as tile[row][col] tiles = [ [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6) From 6e4bd887bc7aed174afaece52f84247351a6be6c Mon Sep 17 00:00:00 2001 From: andrej Date: Fri, 12 Jul 2024 11:14:33 -0600 Subject: [PATCH 6/8] make default data types for matmul integer; check for strict equivalence with integer outputs; add i32 output type support --- aie_kernels/aie2/mm.cc | 18 ++++++ .../basic/matrix_multiplication/common.h | 62 +++++++++++++++---- .../matrix_multiplication/makefile-common | 8 ++- .../single_core/Makefile | 2 +- .../matrix_multiplication/single_core/aie2.py | 8 ++- .../basic/matrix_multiplication/test.cpp | 28 +++++---- .../whole_array/Makefile | 2 +- .../matrix_multiplication/whole_array/aie2.py | 8 ++- 8 files changed, 101 insertions(+), 35 deletions(-) diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc index 0444fa6018..e78bab49b3 100644 --- a/aie_kernels/aie2/mm.cc +++ b/aie_kernels/aie2/mm.cc @@ -366,6 +366,23 @@ void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA, pC); } +template +void matmul_vectorized_4x4x4_i16_i32(const int16 *__restrict pA, + const int16 *__restrict pB, + int32 *__restrict pC) { + // matmul_vectorized operates on two 4x4 input blocks of A, and two 4x4 input + // blocks of B in each iteration. Make sure we have at least 2 blocks in each + // dimension, and that our input matrix is evenly divisible. 
+ constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 4; + static_assert(m % (2 * r) == 0 && m / (2 * r) > 0); + static_assert(k % (2 * s) == 0 && k / (2 * s) > 0); + static_assert(n % (2 * t) == 0 && n / (2 * t) > 0); + return matmul_vectorized(pA, pB, + pC); +} + template void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA, const bfloat16 *__restrict pB, @@ -416,6 +433,7 @@ extern "C" { #define combos(X) \ X(int16, i16, int16, i16, 4, 4, 4) \ + X(int16, i16, int32, i32, 4, 4, 4) \ X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \ X(bfloat16, bf16, float, f32, 4, 8, 4) diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index 892aba5d6a..770d1db052 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -109,11 +109,16 @@ std::vector load_instr_sequence(std::string instr_path) { // Matrix / Float / Math // -------------------------------------------------------------------------- -static inline std::int16_t random_int16_t() { +template +static inline T get_random(); + +template <> +std::int16_t get_random() { return (std::int16_t)rand() % 0x10000; } -static inline std::bfloat16_t random_bfloat16_t() { +template <> +std::bfloat16_t get_random() { // Random numbers should NOT be uniformly between 0 and 1, because that // would make the matrix product AB always close to 1. return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); @@ -165,6 +170,31 @@ bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, return diff < std::max(abs_th, epsilon * norm); } +template +static inline float get_abs_tol(); +template +static inline float get_rel_tol(); + +template <> +float get_abs_tol() { + return 0.0; +} + +template <> +float get_abs_tol() { + return 0.5; +} + +template <> +float get_rel_tol() { + return 0.0; +} + +template <> +float get_rel_tol() { + return 0.05; +} + template void print_matrix(const std::vector matrix, int n_cols, int n_printable_rows = 10, int n_printable_cols = 10, @@ -237,8 +267,14 @@ struct error { template std::optional> -verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, float abs_tol, float rel_tol) { - if (!nearly_equal(expected, actual, rel_tol, abs_tol)) { +verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, + float abs_tol, float rel_tol) { + bool match = expected == actual; + if (abs_tol > 0 || rel_tol > 0) { + // Allow for some tolerance for float data types + match = nearly_equal(expected, actual, rel_tol, abs_tol); + } + if (!match) { return (struct error){row, col, expected, actual}; } return std::nullopt; @@ -273,8 +309,8 @@ void print_progress_bar(std::ostream &os, double progress, int len = 75) { template int verify(int M, int N, int K, std::vector A, std::vector B, - std::vector C, int verbosity = 0, - float abs_tol = 0.5, float rel_tol = 0.05) { + std::vector C, int verbosity = 0, float abs_tol = 0.5, + float rel_tol = 0.05) { int n_errors = 0; std::vector> errors; Tout max_rel_error = (Tout)0.0f; @@ -284,9 +320,9 @@ int verify(int M, int N, int K, std::vector A, std::vector B, for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { - std::optional> error = verify_single( - std::cout, row, col, CRef[row * N + col], C[row * N + col], - abs_tol, rel_tol); + std::optional> error = + verify_single(std::cout, row, col, CRef[row * N + col], + C[row * N + col], abs_tol, rel_tol); if 
(error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); @@ -316,8 +352,8 @@ int verify(int M, int N, int K, std::vector A, std::vector B, template int verify_stochastic(int M, int N, int K, std::vector A, std::vector B, std::vector C, int n_samples, - int verbosity = 0, - float abs_tol=0.5, float rel_tol=0.05) { + int verbosity = 0, float abs_tol = 0.5, + float rel_tol = 0.05) { std::mt19937 rng; auto rows = std::views::iota(0, M); auto cols = std::views::iota(0, N); @@ -343,8 +379,8 @@ int verify_stochastic(int M, int N, int K, std::vector A, print_progress_bar(std::cerr, progress); } Tout ref = mul_acc(M, N, K, row, col, A, B); - std::optional> error = - verify_single(std::cout, row, col, ref, C[row * N + col], abs_tol, rel_tol); + std::optional> error = verify_single( + std::cout, row, col, ref, C[row * N + col], abs_tol, rel_tol); if (error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index bef5598229..ba21462442 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -37,8 +37,8 @@ include ${current_dir}../../makefile-common M?=512 K?=512 N?=512 -dtype_in?=bf16 -dtype_out?=bf16 +dtype_in?=i16 +dtype_out?=i32 ifeq ($(dtype_in),bf16) dtype_in_cpp=std::bfloat16_t @@ -55,6 +55,10 @@ ifeq ($(dtype_out),i16) dtype_out_cpp=int16_t dtype_acc_cpp=int16_t endif +ifeq ($(dtype_out),i32) + dtype_out_cpp=int32_t + dtype_acc_cpp=int32_t +endif ifeq ($(dtype_out),f32) dtype_out_cpp=float dtype_acc_cpp=float diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile index a1da00108f..3fcab3f24d 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/Makefile +++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile @@ -18,7 +18,7 @@ K?=256 N?=256 m?=64 k?=64 -n?=64 +n?=32 kernels=mm_${m}x${k}x${n} aieargs+=-m $m -k $k -n $n diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 96f7cb5b0d..a6ee2e8198 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -25,12 +25,12 @@ def main(): argparser.add_argument("-N", type=int, default=256) argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) - argparser.add_argument("-n", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) argparser.add_argument( - "--dtype_in", type=str, choices=["bf16", "i16"], default="bf16" + "--dtype_in", type=str, choices=["bf16", "i16"], default="i16" ) argparser.add_argument( - "--dtype_out", type=str, choices=["bf16", "i16", "f32"], default="bf16" + "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i32" ) args = argparser.parse_args() my_matmul( @@ -77,6 +77,8 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): dtype_out = T.i16 elif dtype_out_str == "f32": dtype_out = T.f32 + elif dtype_out_str == "i32": + dtype_out = T.i32 A_sz = M * K B_sz = K * N diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index 
645bef5f7b..378f81a407 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -49,8 +49,10 @@ using ACC_DATATYPE = DTYPE_ACC; constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024; constexpr int verify_stochastic_n_samples = 1000; -constexpr float abs_tol = 0.5; -constexpr float rel_tol = 0.05; +// Verification tolerance +// See "Note on Numerical Tolerances" in README.md +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); namespace po = boost::program_options; @@ -154,20 +156,20 @@ int main(int argc, const char *argv[]) { A_DATATYPE *bufA = bo_a.map(); std::vector AVec(A_VOLUME); for (int i = 0; i < A_VOLUME; i++) { - AVec[i] = matmul_common::random_bfloat16_t(); - //AVec[i] = i; + AVec[i] = matmul_common::get_random(); + // AVec[i] = i; } memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); B_DATATYPE *bufB = bo_b.map(); std::vector BVec(B_VOLUME); for (int i = 0; i < B_VOLUME; i++) { - BVec[i] = matmul_common::random_bfloat16_t(); + BVec[i] = matmul_common::get_random(); // Diagonal: - //if(i % N == i / N) { - // BVec[i] = 1.0; - //} else { - // BVec[i] = 0.0; - //} + // if(i % N == i / N) { + // BVec[i] = 1.0; + // } else { + // BVec[i] = 0.0; + // } } memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); @@ -179,7 +181,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 2) { std::cout << "DTYPE_IN = " XSTR(DTYPE_IN) "\n"; std::cout << "DTYPE_OUT = " XSTR(DTYPE_OUT) "\n"; - std::cout << "Verification tolerance " << abs_tol << " absolute, " << rel_tol << " relative.\n"; + std::cout << "Verification tolerance " << abs_tol << " absolute, " + << rel_tol << " relative.\n"; std::cout << "A = \n"; matmul_common::print_matrix(AVec, K); std::cout << "B = \n"; @@ -239,7 +242,8 @@ int main(int argc, const char *argv[]) { if (do_verify_stochastic) { errors = matmul_common::verify_stochastic( - M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, abs_tol, rel_tol); + M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, + abs_tol, rel_tol); } else { errors = matmul_common::verify( M, N, K, AVec, BVec, CVec, abs_tol, rel_tol); diff --git a/programming_examples/basic/matrix_multiplication/whole_array/Makefile b/programming_examples/basic/matrix_multiplication/whole_array/Makefile index 31ee48950d..127606f721 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/Makefile +++ b/programming_examples/basic/matrix_multiplication/whole_array/Makefile @@ -15,7 +15,7 @@ M?=640 K?=896 N?=768 m?=16 -k?=64 +k?=32 n?=48 n_aie_cols?=2 diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 16502efd14..9aaad9f252 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -25,13 +25,13 @@ def main(): argparser.add_argument("-N", type=int, default=512) argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) - argparser.add_argument("-n", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4], default=4) argparser.add_argument( - "--dtype_in", type=str, choices=["bf16", "i16"], default="bf16" + "--dtype_in", type=str, choices=["bf16", "i16"], default="i16" ) 
     argparser.add_argument(
-        "--dtype_out", type=str, choices=["bf16", "i16", "f32"], default="bf16"
+        "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i16"
     )
     args = argparser.parse_args()
     with mlir_mod_ctx() as ctx:
         my_matmul(
@@ -71,6 +71,8 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str):
         dtype_out = T.i16
     elif dtype_out_str == "f32":
         dtype_out = T.f32
+    elif dtype_out_str == "i32":
+        dtype_out = T.i32
 
     if dtype_in_str == "bf16":
         r = 4

From 531e0c81b35c00cf07a438097890ae69f9007fcd Mon Sep 17 00:00:00 2001
From: andrej
Date: Fri, 12 Jul 2024 11:15:52 -0600
Subject: [PATCH 7/8] add tolerances for other output data types

---
 .../basic/matrix_multiplication/common.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index 770d1db052..cba6ff6363 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -175,6 +175,11 @@ static inline float get_abs_tol();
 template
 static inline float get_rel_tol();
 
+template <>
+float get_abs_tol() {
+  return 0.0;
+}
+
 template <>
 float get_abs_tol() {
   return 0.0;
 }
@@ -185,6 +190,16 @@ float get_abs_tol() {
   return 0.5;
 }
 
+template <>
+float get_abs_tol() {
+  return 0.5;
+}
+
+template <>
+float get_rel_tol() {
+  return 0.0;
+}
+
 template <>
 float get_rel_tol() {
   return 0.0;
 }
@@ -195,6 +210,11 @@ float get_rel_tol() {
   return 0.05;
 }
 
+template <>
+float get_rel_tol() {
+  return 0.05;
+}
+
 template
 void print_matrix(const std::vector matrix, int n_cols,
                   int n_printable_rows = 10, int n_printable_cols = 10,

From a67f9b6f39ca61a5197c97fbf9ef7b22016e481d Mon Sep 17 00:00:00 2001
From: andrej
Date: Fri, 12 Jul 2024 11:31:28 -0600
Subject: [PATCH 8/8] add note on numerical tolerances to readme

---
 .../basic/matrix_multiplication/README.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/README.md b/programming_examples/basic/matrix_multiplication/README.md
index 88b701ffa2..7b001744b5 100644
--- a/programming_examples/basic/matrix_multiplication/README.md
+++ b/programming_examples/basic/matrix_multiplication/README.md
@@ -16,4 +16,22 @@ Subdirectories in this directory contain example designs that implement matrix m
 
 * [`single_core`](single_core) - This design performs matrix-matrix multiplication on a single AI Engine core.
 * [`whole_array`](whole_array) - This design evolves `single_core`, by splitting the computation and parallelizing it. It utilizes all available AI Engine cores simultaneously.
-* [`matrix_vector`](matrix_vector) - This design is a specialization to the matrix-vector-multiplication case, which poses unique challenges due to lower computation density. *Work in progress.*
\ No newline at end of file
+* [`matrix_vector`](matrix_vector) - This design is a specialization to the matrix-vector-multiplication case, which poses unique challenges due to lower computation density. *Work in progress.*
+
+## Note on Numerical Tolerances
+
+This directory contains verification code that ensures the designs in the subdirectories produce the correct output.
+
+The designs can be configured to work on different input and output data types, based on the Makefile variables `dtype_in` and `dtype_out`.
+In the default configuration, all designs consume integer inputs and produce integer outputs.
+For this case, the verification checks for strict equivalence between the reference output computed on the host CPU and the output calculated on the AI Engine.
+That is, verification will only pass for integer data types if the output is equivalent bit-by-bit.
+
+For floating point data types, the verification code allows the AI Engine output to deviate from the reference calculated on the host CPU by some limited maximal relative and absolute tolerance (defined in `common.h`).
+This standard practice is necessary for the following reasons:
+
+ - Operations on IEEE 754 floating point values are not associative, i.e., the order in which they are performed can affect the results. All designs in the subdirectories perform tiling of the input matrices, multiplying and accumulating sub-matrices in chunks. The reference calculation code on the CPU, on the other hand, does not perform tiling. As such, some differences due to this reordering are expected.
+ - The reference on the host CPU is always computed in `float32`, even if the input data type is `bfloat16`, since the host CPU does not support native `bfloat16` multiplication. This means results are calculated with higher precision on the CPU and subsequently truncated, whereas the AI Engine natively uses the lower-precision data type, which is faster but less precise.
+ - If the output data type is lower-precision than the accumulation data type, the tiling in the `K` dimension affects the results. For example, when multiplying `bfloat16` numbers, the AI Engine accumulates results in higher-precision `float32`. Our designs perform this accumulation for `k` (the tile size in the `K` dimension) steps before writing results back into the output buffer. If the output buffer is lower-precision, results are truncated at that point, so a larger `k` means fewer such truncations take place. The AI Engine also provides a higher-precision "cascade" data path, which can be used to accumulate results between cores, although none of the designs in this directory currently make use of it.
+
+In summary, different choices of data types, tiling strategies, and usage of AI Engine components can all affect floating point results in subtle ways. These choices present trade-offs that must be weighed on a case-by-case basis for the application at hand.
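
As a concrete illustration of the first and third points, the following standalone snippet (not part of the designs or of the verification code; it only assumes a C++23 compiler such as the `g++-13` the makefile already uses) shows how re-associating a `float` sum changes its result, and how truncating an accumulator to `bfloat16` after every step drifts away from a `float32` accumulation:

```cpp
// tolerance_demo.cpp (hypothetical file name, for illustration only)
// Compile with, e.g.:  g++-13 -std=c++23 tolerance_demo.cpp
#include <stdfloat>
#include <iostream>

int main() {
  // 1. Floating point addition is not associative: the same three values,
  //    summed in different orders, give different results.
  float a = 1.0e8f, b = -1.0e8f, c = 1.0f;
  float sum1 = (a + b) + c; // == 1.0f
  float sum2 = a + (b + c); // == 0.0f, since b + c rounds back to -1.0e8f
  std::cout << "sum1 = " << sum1 << ", sum2 = " << sum2 << "\n";

  // 2. Truncating the accumulator to bfloat16 after every partial sum loses
  //    precision compared to accumulating in float32 and truncating once.
  std::bfloat16_t acc_bf16 = std::bfloat16_t(0.0f);
  float acc_f32 = 0.0f;
  for (int i = 0; i < 256; ++i) {
    acc_bf16 = std::bfloat16_t(float(acc_bf16) + 0.1f);
    acc_f32 += 0.1f;
  }
  std::cout << "bf16 accumulation: " << float(acc_bf16) << "\n"
            << "f32 accumulation:  " << acc_f32 << "\n";
}
```

The designs face the same effects at a larger scale, which is why the floating point verification compares against the tolerances returned by `get_abs_tol`/`get_rel_tol` rather than requiring bit-exact results.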