From 68bf51268dd3070516d6c78434b6ccd685979799 Mon Sep 17 00:00:00 2001
From: Zhou Yaoyang
Date: Wed, 4 Dec 2024 10:39:33 +0800
Subject: [PATCH] feat(memory): add mem latency benchmark

Co-authored-by: jueshiwenli <275626310@qq.com>
---
 apps/mem_test/mem_test_latency/Makefile       |  9 ++
 apps/mem_test/mem_test_latency/README.md      | 47 +++++++++++
 .../include/mem_test_latency.h                | 83 +++++++++++++++++++
 apps/mem_test/mem_test_latency/make_bin.sh    |  4 +
 .../mem_test_latency/mem_test_latency.c       | 24 ++++++
 5 files changed, 167 insertions(+)
 create mode 100644 apps/mem_test/mem_test_latency/Makefile
 create mode 100644 apps/mem_test/mem_test_latency/README.md
 create mode 100644 apps/mem_test/mem_test_latency/include/mem_test_latency.h
 create mode 100644 apps/mem_test/mem_test_latency/make_bin.sh
 create mode 100644 apps/mem_test/mem_test_latency/mem_test_latency.c

diff --git a/apps/mem_test/mem_test_latency/Makefile b/apps/mem_test/mem_test_latency/Makefile
new file mode 100644
index 00000000..c86c43dd
--- /dev/null
+++ b/apps/mem_test/mem_test_latency/Makefile
@@ -0,0 +1,9 @@
+NAME_1 = mem_test_latency
+CFLAGS += -DINST=$(INST) -g
+NAME = build-$(NAME_1)-$(INST)
+$(NAME).c:mem_test_latency.c
+	cp mem_test_latency.c $(NAME).c
+SRCS = $(NAME).c
+include $(AM_HOME)/Makefile.app
+clean-build:
+	rm -f build-*
\ No newline at end of file
diff --git a/apps/mem_test/mem_test_latency/README.md b/apps/mem_test/mem_test_latency/README.md
new file mode 100644
index 00000000..c541f3fb
--- /dev/null
+++ b/apps/mem_test/mem_test_latency/README.md
@@ -0,0 +1,47 @@
+# Build bmk
+
+Remember to set `AM_HOME` to the root of this repo.
+
+Clear old build
+```
+rm -rf $AM_HOME/am/build build
+```
+
+Build latency bmk with a specific footprint size, given that L3 size is 16:
+```
+make ARCH=riscv64-xs INST=16
+```
+
+Note that `INST` here will be multiplied by 1MB to flush caches
+
+# Build DUT with prefetcher off
+
+To avoid prefetching, please remember to turn off prefetchers.
+
+# Run
+
+Run it with NEMU first to check that the binary was built correctly:
+```
+$NEMU_HOME/build/riscv64-nemu-interpreter -b build/build-mem_test_latency-16-riscv64-xs.bin
+```
+NEMU will print something like
+```
+Welcome to riscv64-NEMU!
+For help, type "help"
+the test of load-add-load hit result
+instrs 2015 cycles 2015
+```
+Note that `cycles` is meaningless in NEMU
+
+Then test it with Xiangshan:
+```
+/path/to/xiangshan/build/emu -i build/build-mem_test_latency-16-riscv64-xs.bin
+```
+
+# Compute latency
+
+Divide the `cycles` value printed by the Xiangshan emu by 1000 to get the load-to-use latency (the timed loop issues about 1000 dependent loads).
+
+Load-to-use latency: https://www.quora.com/What-does-it-mean-by-load-use-penalty-in-computer-architecture-Does-it-have-any-other-name
+
+If prefetchers are turned off, Load-to-use latency consists of L1+L2+L3+Memory latency
diff --git a/apps/mem_test/mem_test_latency/include/mem_test_latency.h b/apps/mem_test/mem_test_latency/include/mem_test_latency.h
new file mode 100644
index 00000000..0972341b
--- /dev/null
+++ b/apps/mem_test/mem_test_latency/include/mem_test_latency.h
@@ -0,0 +1,111 @@
+/*
+ * Helpers for the load-to-use latency benchmark: build a pointer chain in
+ * memory, then time a loop of dependent loads with the mcycle/minstret CSRs.
+ */
+#ifndef MEM_TEST_LATENCY_H
+#define MEM_TEST_LATENCY_H
+
+/* NOTE(review): the header names on the two #include lines below are missing
+ * in this copy of the patch -- restore them before applying. */
+#include
+#include
+
+#define BYTE (1)
+#define KB (1024*BYTE)
+#define MB (1024*KB)
+#define GB (1024*MB)
+
+/* Start of the pointer chain; cache_iloop() hard-codes the same address. */
+#define _PERF_TEST_ADDR_BASE 0x80010000
+//#define _PERF_TEST_ADDR_BASE 0x100010000
+//#define _PERF_TEST_ADDR_BASE 0x80050000
+/* Passed by main() as the spacing between chain entries (8 bytes). */
+#define _PERF_CACHELINE_SIZE_BYTE (8 * BYTE)
+//#define _PERF_L1_SIZE_BYTE (32 * KB)
+//#define _PERF_L1_SIZE_BYTE (33 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
+#define _PERF_L2_SIZE_BYTE (1 * MB)
+#define _PERF_L3_SIZE_BYTE (6 * MB)
+#define _PERF_MEM_SIZE_BYTE (32 *MB)
+#define _TEST_NUM_SIZE (32 * KB)
+/* main() extends the chained region by INST * _STEP_SIZE bytes. */
+#define _STEP_SIZE (1 *MB)
+
+/*
+ * Fill [base_addr, end_addr) with a pointer chain: one 8-byte entry is stored
+ * every `step` bytes.
+ *
+ * base_addr, end_addr: raw addresses bounding the region (end is exclusive).
+ * step:   spacing between entries in bytes; a multiple of 8, at least 8.
+ * choose: 0 makes every entry hold its own address (labelled the L1 test by
+ *         the original author); any other value makes it hold its address
+ *         + 64, so each dependent load lands 64 bytes further on (L2/L3 test).
+ */
+void full_cache_warmup_i(uint64_t base_addr,uint64_t end_addr,uint64_t step,int choose){
+  assert(step % 8 ==0);
+  assert(step >= 8);
+  // The link distance does not change inside the loop, so pick it once.
+  const uint64_t link_offset = (choose == 0) ? 0 : 64;
+  for(uint64_t cur_addr = base_addr; cur_addr < end_addr; cur_addr += step){
+    *((uint64_t*)cur_addr) = cur_addr + link_offset;
+  }
+}
+
+/*
+ * Time a chain of dependent loads that starts at 0x80010000.
+ *
+ * Every iteration issues two loads, each using the previous load's result as
+ * its address (s6 -> s8 -> s6). The body runs 501 times (s4 is incremented
+ * and compared against the bound 500 in s5, inclusive), i.e. 1002 loads.
+ * mcycle and minstret are read just before entering the loop and again
+ * right after leaving it.
+ *
+ * instr_count: receives the minstret delta.
+ * cycle_count: receives the mcycle delta.
+ * Both deltas are computed with subw, i.e. as 32-bit results.
+ */
+__attribute__((aligned(256)))
+void cache_iloop(unsigned long long* instr_count, unsigned long long* cycle_count){
+  *instr_count = 0;
+  *cycle_count = 0;
+  asm volatile(
+    // Local numeric labels: named labels clash if this asm is ever emitted twice.
+    "jal zero, 2f;"
+    "xor s8 , zero , zero;"   // never executed: the jump above skips it
+
+    "1:"   // timed loop
+    "ld s8,0(s6);"
+    "ld s6,0(s8);"
+
+    "addi s4 , s4 , 1;"
+    "bleu s4,s5,1b;"
+
+    "jal zero ,3f;"
+
+    "2:"   // setup
+    "li s4 , 0;"
+    "li s5 , 500;"
+    "li s6 , 0x80010000;"   // must match _PERF_TEST_ADDR_BASE
+    "li s7 , 0;"
+    "li s8 , 0;"
+
+    "csrr s9 , mcycle;"
+    "csrr s10, minstret;"
+
+    "jal zero, 1b;"
+    "3:"   // read the counters again and hand back the deltas
+    "csrr s11 , mcycle;"
+    "csrr t3 , minstret;"
+
+    "subw %[c], s11 , s9;"
+    "subw %[i], t3 , s10;"
+
+    : [c] "=r" (*cycle_count),[i] "=r" (*instr_count)
+    :
+    // "memory": the timed loads read what full_cache_warmup_i() stored, so the
+    // compiler must not move those stores across this asm.
+    : "zero","s4","s5","s6","s7","s8","s9","s10","s11","t3","t4","t5","t6","cc","memory"
+  );
+}
+
+#endif /* MEM_TEST_LATENCY_H */
diff --git a/apps/mem_test/mem_test_latency/make_bin.sh b/apps/mem_test/mem_test_latency/make_bin.sh
new file mode 100644
index 00000000..955c9dab
--- /dev/null
+++ b/apps/mem_test/mem_test_latency/make_bin.sh
@@ -0,0 +1,4 @@
+for ((i=10;i<20;i++))
+do
+    make ARCH=riscv64-xs INST=$i
+done
\ No newline at end of file
diff --git a/apps/mem_test/mem_test_latency/mem_test_latency.c b/apps/mem_test/mem_test_latency/mem_test_latency.c
new file mode 100644
index 00000000..b9486e5c
--- /dev/null
+++ b/apps/mem_test/mem_test_latency/mem_test_latency.c
@@ -0,0 +1,32 @@
+/* NOTE(review): the header name on the first #include is missing in this
+ * copy of the patch -- restore it before applying. */
+#include
+#include "mem_test_latency.h"
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+/*
+ * Benchmark entry point: build the pointer chain, run the timed load loop,
+ * and print the instruction and cycle deltas.
+ */
+int main(){
+  unsigned long long busy_cycles;
+  unsigned long long busy_instrs;
+  // INST is supplied by the Makefile (-DINST=...). It sizes the chained region
+  // and is also passed as `choose`, so INST=0 selects the self-pointing chain.
+  uint64_t cache_warmup_size_count = atoi(xstr(INST));
+  uint64_t start_addr = _PERF_TEST_ADDR_BASE;
+  // Region length: _PERF_L2_SIZE_BYTE plus INST * _STEP_SIZE bytes.
+  uint64_t end_addr = (_PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + cache_warmup_size_count *_STEP_SIZE);
+  // One chain entry every _PERF_CACHELINE_SIZE_BYTE bytes.
+  full_cache_warmup_i(start_addr,end_addr,_PERF_CACHELINE_SIZE_BYTE,cache_warmup_size_count);
+  // Timed dependent-load loop.
+  cache_iloop(&busy_instrs,&busy_cycles);
+
+  printf("the test of load-add-load hit result\n");
+  // %d takes an int; the counters are 32-bit deltas, so narrow them explicitly
+  // instead of passing unsigned long long through the varargs.
+  printf("instrs %d cycles %d\n",(int)busy_instrs,(int)busy_cycles);
+  return 0;
+}
\ No newline at end of file