-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(memory): add mem latency benchmark
Co-authored-by: jueshiwenli <275626310@qq.com>
- Loading branch information
1 parent
6fc8d88
commit 68bf512
Showing
5 changed files
with
167 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Build rules for the memory-latency benchmark.
# INST must be given on the command line (e.g. `make ARCH=riscv64-xs INST=16`);
# it is forwarded to the C code as -DINST=<value> and is part of the image name.
NAME_1 = mem_test_latency
CFLAGS += -DINST=$(INST) -g
NAME = build-$(NAME_1)-$(INST)

# Every INST value is compiled from its own copy of the benchmark source.
$(NAME).c: mem_test_latency.c
	cp mem_test_latency.c $(NAME).c

SRCS = $(NAME).c
include $(AM_HOME)/Makefile.app

# Delete files matching build-* in this directory (such as the generated
# per-INST source copies). Declared phony so that a file that happens to be
# named "clean-build" can never mask the rule.
.PHONY: clean-build
clean-build:
	rm -f build-*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Build bmk | ||
|
||
Remember to set `AM_HOME` to the root of this repo. | ||
|
||
Clear old build | ||
``` | ||
rm -rf $AM_HOME/am/build build | ||
``` | ||
|
||
Build the latency benchmark with a specific footprint size, e.g. given that the L3 size is 16 MB: | ||
``` | ||
make ARCH=riscv64-xs INST=16 | ||
``` | ||
|
||
Note that `INST` here will be multiplied by 1MB to flush caches | ||
|
||
# Build DUT with prefetcher off | ||
|
||
To avoid prefetching, please remember to turn off prefetchers. | ||
|
||
# Run | ||
|
||
First run it with NEMU to check that the build is correct: | ||
``` | ||
$NEMU_HOME/build/riscv64-nemu-interpreter -b build/build-mem_test_latency-16-riscv64-xs.bin | ||
``` | ||
NEMU will print something like | ||
``` | ||
Welcome to riscv64-NEMU! | ||
For help, type "help" | ||
the test of load-add-load hit result | ||
instrs 2015 cycles 2015 | ||
``` | ||
Note that `cycles` is meaningless in NEMU | ||
|
||
Then test it with Xiangshan: | ||
``` | ||
/path/to/xiangshan/build/emu -i build/build-mem_test_latency-16-riscv64-xs.bin | ||
``` | ||
|
||
# Compute latency | ||
|
||
Divide the `cycles` value printed by the Xiangshan emu by 1000 to get the load-to-use latency. | ||
|
||
Load-to-use latency: https://www.quora.com/What-does-it-mean-by-load-use-penalty-in-computer-architecture-Does-it-have-any-other-name | ||
|
||
If prefetchers are turned off, the load-to-use latency consists of the L1 + L2 + L3 + memory latency. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#include <klib.h> | ||
#include <csr.h> | ||
|
||
/* Size units, expressed in bytes (plain int constants). */
#define BYTE (1)
#define KB (1024*BYTE)
#define MB (1024*KB)
#define GB (1024*MB)

/*
 * First address of the memory region the benchmark writes and then reads.
 * The ULL suffix makes every "base + size" expression evaluate in 64 bits;
 * without it the literal is a 32-bit unsigned int and the sum could wrap
 * before being widened to uint64_t.
 * cache_iloop() hard-codes the same address inside its inline assembly, so
 * the two must be changed together.
 */
#define _PERF_TEST_ADDR_BASE 0x80010000ULL
//#define _PERF_TEST_ADDR_BASE 0x100010000
//#define _PERF_TEST_ADDR_BASE 0x80050000

/*
 * Despite its name this is 8 bytes, i.e. one 64-bit word; main() passes it
 * to full_cache_warmup_i() as the distance between consecutive stores.
 */
#define _PERF_CACHELINE_SIZE_BYTE (8 * BYTE)

/* Size parameters used to dimension the test region. */
//#define _PERF_L1_SIZE_BYTE (32 * KB)
//#define _PERF_L1_SIZE_BYTE (33 * KB)
#define _PERF_L1_SIZE_BYTE (64 * KB)
#define _PERF_L2_SIZE_BYTE (1 * MB)
#define _PERF_L3_SIZE_BYTE (6 * MB)
#define _PERF_MEM_SIZE_BYTE (32 *MB)
#define _TEST_NUM_SIZE (32 * KB)
/* main() extends the warmed-up region by INST times this amount. */
#define _STEP_SIZE (1 *MB)
|
||
/*
 * Fill a memory region with 64-bit address values that a pointer-chasing
 * loop can later follow.
 *
 * Starting at base_addr and advancing by step bytes while the current
 * address is below end_addr, one 64-bit word is stored at each address:
 *   - choose == 0: the word holds its own address, so chasing it keeps
 *     re-reading the same location;
 *   - choose != 0: the word holds (its own address + 64), so chasing it
 *     advances 64 bytes per load.
 *
 * base_addr    first address written (used directly as a pointer).
 * end_addr     exclusive upper bound for the start of each store; if
 *              (end_addr - base_addr) is not a multiple of 8 the last
 *              8-byte store extends past end_addr.
 * step         distance in bytes between stores; must be a non-zero
 *              multiple of 8 (asserted).
 * choose       selects the stored value as described above.
 */
void full_cache_warmup_i(uint64_t base_addr,uint64_t end_addr,uint64_t step,int choose){
    assert(step % 8 ==0);
    assert(step >= 8);

    /* Loop-invariant: how far ahead of its own slot each stored value points. */
    const uint64_t link_offset = (choose == 0) ? 0 : 64;

    for (uint64_t cur_addr = base_addr; cur_addr < end_addr; cur_addr += step) {
        *((uint64_t *)(uintptr_t)cur_addr) = cur_addr + link_offset;
    }
}
|
||
__attribute__((aligned(256))) | ||
void cache_iloop(unsigned long long* instr_count, unsigned long long* cycle_count){ | ||
*instr_count = 0; | ||
*cycle_count = 0; | ||
asm volatile( | ||
"jal zero, init;" | ||
"xor s8 , zero , zero;" | ||
|
||
"loop:" | ||
"ld s8,0(s6);" | ||
"ld s6,0(s8);" | ||
|
||
"addi s4 , s4 , 1;" | ||
"bleu s4,s5,loop;" | ||
|
||
"jal zero ,term;" | ||
|
||
"init:" | ||
"li s4 , 0;" | ||
"li s5 , 500;" | ||
"li s6 , 0x80010000;" | ||
"li s7 , 0;" | ||
"li s8 , 0;" | ||
|
||
|
||
"csrr s9 , mcycle;" | ||
"csrr s10, minstret;" | ||
|
||
"jal zero, loop;" | ||
"term:" | ||
"csrr s11 , mcycle;" | ||
"csrr t3 , minstret;" | ||
|
||
"subw %[c], s11 , s9;" | ||
"subw %[i], t3 , s10;" | ||
|
||
: [c] "=r" (*cycle_count),[i] "=r" (*instr_count) | ||
: | ||
: "zero","s4","s5","s6","s7","s8","s9","s10","s11","t3","t4","t5","t6","cc" | ||
|
||
); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/usr/bin/env bash
# Build the latency benchmark once for every INST value from 10 to 19.
# The C-style for loop is a bash feature, hence the explicit bash shebang.
for ((i = 10; i < 20; i++)); do
    # Stop at the first failing build instead of silently carrying on.
    make ARCH=riscv64-xs INST="$i" || exit
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#include <klib.h> | ||
#include "mem_test_latency.h" | ||
|
||
#define xstr(s) str(s) | ||
#define str(s) #s | ||
|
||
int main(){ | ||
unsigned long long busy_cycles; | ||
unsigned long long busy_instrs; | ||
//control | ||
uint64_t cache_warmup_size_count = atoi(xstr(INST)); | ||
//_PERF_TEST_ADDR_BASE 0x80010000 | ||
uint64_t start_addr = _PERF_TEST_ADDR_BASE; | ||
//init end address, step = 256k,input step size | ||
//_PERF_L1_SIZE_BYTE (64 * KB) _TEST_NUM_SIZE (32 * KB) _STEP_SIZE (256 *KB) | ||
uint64_t end_addr = (_PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + cache_warmup_size_count *_STEP_SIZE); | ||
//make cache warmup put the test address in l1/l2/l3 _PERF_CACHELINE_SIZE_BYTE (8 * BYTE) | ||
full_cache_warmup_i(start_addr,end_addr,_PERF_CACHELINE_SIZE_BYTE,cache_warmup_size_count); | ||
//cache latency test | ||
cache_iloop(&busy_instrs,&busy_cycles); | ||
|
||
printf("the test of load-add-load hit result\n"); | ||
printf("instrs %d cycles %d\n",busy_instrs,busy_cycles); | ||
} |