Skip to content

Commit a44efe6

Browse files
committed
Use unplaced GEMM, python code refactor, and add throughput metric
1 parent 63caafa commit a44efe6

File tree

3 files changed

+489
-529
lines changed

3 files changed

+489
-529
lines changed

example/gemm/CMakeLists.txt

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ function(add_aie_gemm_xclbin m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_SIZE AR
1616
add_aie_design(${XCLBIN_NAME}
1717
XCLBIN_ONLY
1818
PYTHON gemm.py
19-
PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir
19+
PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir
2020
AIE_CORE_KERNELS ${ARCHIVE_NAME}
2121
EXTRA_AIECC_FLAGS --dynamic-objFifos
2222
OUTPUT_XCLBIN GEMM_XCLBIN_${XCLBIN_NAME})
@@ -43,7 +43,7 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S
4343
add_aie_design(${EXAMPLE}
4444
INSTS_ONLY
4545
PYTHON gemm.py
46-
PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir
46+
PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir
4747
EXTRA_AIECC_FLAGS --dynamic-objFifos
4848
OUTPUT_INSTS GEMM_INSTS)
4949

@@ -69,7 +69,8 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S
6969
"PASS!"
7070
METRICS
7171
"Latency" [=[Latency \(us\): (?P<metric>\d+)]=]
72-
"Bandwidth" [=[Effective Bandwidth: (?P<metric>[\d\.e\+-]+) GB/s]=])
72+
"Bandwidth" [=[Effective Bandwidth: (?P<metric>[\d\.e\+-]+) GB/s]=]
73+
"GFLOP/s" [=[Throughput:\s*(?P<metric>[\d\.e\+-]+) GFLOP/s]=])
7374
endfunction()
7475

7576
set(M_LIST "2048")
@@ -86,15 +87,22 @@ set(ARCHIVE_NAME "gemm_${m}x${k}x${n}_archive.a")
8687
set(AIE_BUILD_DIR ${CMAKE_BINARY_DIR}/aie)
8788
set(ARCHIVE_PATH "${AIE_BUILD_DIR}/${ARCHIVE_NAME}")
8889

90+
set(PRIO_ACCURACY True)
8991
set(EMULATE_BFLOAT16_MMUL_WITH_BFP16 False)
9092
set (B_COL_MAJ 0)
9193
set (C_COL_MAJ 0)
9294

93-
set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "bf16_f32_ONLY" "ROUND_CONV_EVEN")
95+
set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "ROUND_CONV_EVEN")
9496
if (EMULATE_BFLOAT16_MMUL_WITH_BFP16)
9597
set(EMULATE_STR --emulate-bf16-mmul-with-bfp16)
9698
list(APPEND MM_KERNEL_DEFINES "AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16")
9799
endif()
100+
if (PRIO_ACCURACY)
101+
set(PRIO_ACC_STR --prio-accuracy)
102+
list(APPEND MM_KERNEL_DEFINES "bf16_f32_ONLY")
103+
else()
104+
list(APPEND MM_KERNEL_DEFINES "bf16_bf16_ONLY")
105+
endif()
98106
if (B_COL_MAJ)
99107
list(APPEND MM_KERNEL_DEFINES "B_COL_MAJ")
100108
endif()

example/gemm/gemm.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,11 @@ int main(int argc, const char *argv[])
160160
double bandwidth_GBps = (total_bytes / (1024 * 1024 * 1024)) / (npu_time * 1e-6);
161161
std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl;
162162

163+
// Cast to unsigned long long before multiplying: with M = K = N = 2048, for example, 2*M*K*N is about 17 * 10^9 ops, which does not fit in 32 bits
164+
unsigned long long n_ops = static_cast<unsigned long long>(M) * K * N * 2;
165+
float throughput = n_ops / npu_time / 1e3; // GOP/s
166+
std::cout << "Throughput: " << throughput << " GFLOP/s" << std::endl;
167+
163168
std::bfloat16_t *bufOut1 = bo_out.map<std::bfloat16_t *>();
164169

165170
// Compare with golden reference

0 commit comments

Comments
 (0)