Skip to content

Commit

Permalink
feat(memory): add mem latency benchmark
Browse files Browse the repository at this point in the history
Co-authored-by: jueshiwenli <275626310@qq.com>
  • Loading branch information
shinezyy and jueshiwenli committed Dec 4, 2024
1 parent 6fc8d88 commit 68bf512
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 0 deletions.
9 changes: 9 additions & 0 deletions apps/mem_test/mem_test_latency/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base name of the benchmark; the hand-written source file is mem_test_latency.c.
NAME_1 = mem_test_latency
# INST is expected on the make command line (e.g. `make ARCH=riscv64-xs INST=16`).
# It is forwarded to the C code as the INST preprocessor macro; -g adds debug info.
CFLAGS += -DINST=$(INST) -g
# The application name embeds INST, so every INST value gets its own name.
NAME = build-$(NAME_1)-$(INST)
# Copy the source to a file named after NAME so SRCS can refer to $(NAME).c.
# NOTE(review): recipe lines must begin with a TAB in the real Makefile.
$(NAME).c:mem_test_latency.c
cp mem_test_latency.c $(NAME).c
SRCS = $(NAME).c
# Shared application build rules; defined outside this file.
include $(AM_HOME)/Makefile.app
# Delete everything in this directory whose name starts with "build-".
clean-build:
rm -f build-*
47 changes: 47 additions & 0 deletions apps/mem_test/mem_test_latency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Build bmk

Remember to set `AM_HOME` to the root of this repo.

Clear old build
```
rm -rf $AM_HOME/am/build build
```

Build the latency benchmark with a specific footprint size, for example for an L3 size of 16 MB:
```
make ARCH=riscv64-xs INST=16
```

Note that `INST` is multiplied by 1 MB to determine how much memory is written to flush the caches.

# Build DUT with prefetcher off

To avoid prefetching, please remember to turn off prefetchers.

# Run

With NEMU to test building correctness:
```
$NEMU_HOME/build/riscv64-nemu-interpreter -b build/build-mem_test_latency-16-riscv64-xs.bin
```
NEMU will print something like
```
Welcome to riscv64-NEMU!
For help, type "help"
the test of load-add-load hit result
instrs 2015 cycles 2015
```
Note that `cycles` is meaningless in NEMU

Then test it with Xiangshan:
```
/path/to/xiangshan/build/emu -i build/build-mem_test_latency-16-riscv64-xs.bin
```

# Compute latency

Divide the `cycles` value printed by the Xiangshan emu by 1000 to get the load-to-use latency.

Load-to-use latency: https://www.quora.com/What-does-it-mean-by-load-use-penalty-in-computer-architecture-Does-it-have-any-other-name

If prefetchers are turned off, the load-to-use latency consists of the L1 + L2 + L3 + memory latency.
83 changes: 83 additions & 0 deletions apps/mem_test/mem_test_latency/include/mem_test_latency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <klib.h>
#include <csr.h>

// Size units, expressed in bytes.
#define BYTE (1)
#define KB (1024*BYTE)
#define MB (1024*KB)
#define GB (1024*MB)

// Start address of the region that main() fills before the measurement.
// cache_iloop() hard-codes the same value (0x80010000) in its inline
// assembly, so the two have to be changed together.
#define _PERF_TEST_ADDR_BASE 0x80010000
//#define _PERF_TEST_ADDR_BASE 0x100010000
//#define _PERF_TEST_ADDR_BASE 0x80050000
// NOTE(review): despite the name this is 8 bytes; main() passes it as the
// distance between consecutive warm-up stores, not as a cache-line size.
#define _PERF_CACHELINE_SIZE_BYTE (8 * BYTE)
//#define _PERF_L1_SIZE_BYTE (32 * KB)
//#define _PERF_L1_SIZE_BYTE (33 * KB)
#define _PERF_L1_SIZE_BYTE (64 * KB)
// main() always adds this amount to the size of the warmed-up region.
#define _PERF_L2_SIZE_BYTE (1 * MB)
#define _PERF_L3_SIZE_BYTE (6 * MB)
#define _PERF_MEM_SIZE_BYTE (32 *MB)
#define _TEST_NUM_SIZE (32 * KB)
// main() multiplies this by the INST build parameter to size the region.
#define _STEP_SIZE (1 *MB)

/*
 * Fill a memory region with 64-bit address values.
 *
 * Starting at base_addr and advancing by `step` bytes while the current
 * address is below end_addr, one 64-bit word is stored at each address:
 *   - choose == 0: the word holds its own address;
 *   - choose != 0: the word holds its own address + 64.
 *
 * base_addr  first address written; it is used directly as a pointer
 * end_addr   exclusive upper bound for the addresses written
 * step       distance in bytes between stores; must be a non-zero multiple of 8
 * choose     selects the stored value as described above
 */
void full_cache_warmup_i(uint64_t base_addr,uint64_t end_addr,uint64_t step,int choose){
    assert(step % 8 == 0);
    assert(step >= 8);

    /* Offset added to a slot's own address to form the value stored in it. */
    const uint64_t link_offset = (choose == 0) ? 0 : 64;

    for (uint64_t cur_addr = base_addr; cur_addr < end_addr; cur_addr += step) {
        /* Go through uintptr_t for the integer-to-pointer conversion. */
        *(uint64_t *)(uintptr_t)cur_addr = cur_addr + link_offset;
    }
}

/*
 * Run a dependent-load loop and report how many cycles and instructions the
 * mcycle / minstret counters advanced while it ran.
 *
 * The loop starts at address 0x80010000 (the value of _PERF_TEST_ADDR_BASE)
 * and, twice per iteration, loads the next address from the current one, so
 * each load depends on the result of the previous load. The memory it walks
 * must already contain valid addresses (see full_cache_warmup_i()).
 * The counter starts at 0 and the loop repeats while it is <= 500 after the
 * increment, i.e. the body runs 501 times (1002 loads).
 *
 * instr_count  receives minstret(after loop) - minstret(before loop)
 * cycle_count  receives mcycle(after loop)   - mcycle(before loop)
 *
 * noinline keeps the aligned(256) placement meaningful: an inlined copy would
 * not be aligned.
 */
__attribute__((aligned(256), noinline))
void cache_iloop(unsigned long long* instr_count, unsigned long long* cycle_count){
    *instr_count = 0;
    *cycle_count = 0;
    asm volatile(
        /* Numeric local labels (1 = loop, 2 = init, 3 = term) stay unique even
         * if the compiler emits this asm statement more than once. */
        "jal zero, 2f;"
        /* Never executed (jumped over); kept so the code layout is unchanged. */
        "xor s8 , zero , zero;"

        "1:"
        "ld s8,0(s6);"
        "ld s6,0(s8);"

        "addi s4 , s4 , 1;"
        "bleu s4,s5,1b;"

        "jal zero ,3f;"

        "2:"
        "li s4 , 0;"
        "li s5 , 500;"
        "li s6 , 0x80010000;"
        "li s7 , 0;"
        "li s8 , 0;"

        /* Counter snapshots taken right before entering the loop. */
        "csrr s9 , mcycle;"
        "csrr s10, minstret;"

        "jal zero, 1b;"
        "3:"
        "csrr s11 , mcycle;"
        "csrr t3 , minstret;"

        /* Full-width subtraction: the outputs are 64-bit, so do not truncate
         * the deltas to 32 bits. */
        "sub %[c], s11 , s9;"
        "sub %[i], t3 , s10;"

        : [c] "=r" (*cycle_count),[i] "=r" (*instr_count)
        :
        /* "memory": the loop reads memory written by C code, so earlier
         * stores must be completed before this statement. */
        : "zero","s4","s5","s6","s7","s8","s9","s10","s11","t3","t4","t5","t6","cc","memory"

    );

}
4 changes: 4 additions & 0 deletions apps/mem_test/mem_test_latency/make_bin.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Build one latency-benchmark binary for every INST value from 10 to 19.
# The C-style for loop is a bash extension, hence the explicit bash shebang.
for ((i = 10; i < 20; i++)); do
    make ARCH=riscv64-xs INST="$i"
done
24 changes: 24 additions & 0 deletions apps/mem_test/mem_test_latency/mem_test_latency.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include <klib.h>
#include "mem_test_latency.h"

/* Two-level stringification: INST is macro-expanded first, then quoted. */
#define xstr(s) str(s)
#define str(s) #s

/*
 * Benchmark entry point.
 *
 * 1. Reads the build-time INST parameter.
 * 2. Fills [_PERF_TEST_ADDR_BASE,
 *           _PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + INST * _STEP_SIZE)
 *    with address values, one 64-bit word every _PERF_CACHELINE_SIZE_BYTE
 *    bytes. With INST == 0 every word holds its own address; otherwise it
 *    holds its own address + 64.
 * 3. Runs the measured dependent-load loop and prints the instruction and
 *    cycle counts it reports.
 */
int main(){
    unsigned long long busy_cycles;
    unsigned long long busy_instrs;

    /* INST arrives as a preprocessor token; turn it into a number. */
    uint64_t cache_warmup_size_count = atoi(xstr(INST));

    uint64_t start_addr = _PERF_TEST_ADDR_BASE;
    uint64_t end_addr = (_PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + cache_warmup_size_count *_STEP_SIZE);

    /* The last argument only selects between the two fill patterns, so pass
     * an explicit 0/1 flag instead of narrowing the 64-bit count to int. */
    full_cache_warmup_i(start_addr,end_addr,_PERF_CACHELINE_SIZE_BYTE,cache_warmup_size_count != 0);

    /* Measured section. */
    cache_iloop(&busy_instrs,&busy_cycles);

    printf("the test of load-add-load hit result\n");
    /* The counters are unsigned long long, so print them with %llu. */
    printf("instrs %llu cycles %llu\n",busy_instrs,busy_cycles);
    return 0;
}

0 comments on commit 68bf512

Please sign in to comment.