Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Memory latency #44

Merged
merged 1 commit into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions apps/mem_test/mem_test_latency/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build rules for the mem_test_latency benchmark.
# INST must be given on the make command line (e.g. `make ARCH=riscv64-xs INST=16`);
# it is forwarded to the C code as the INST macro and is also part of the
# output name, so binaries built with different INST values do not collide.
NAME_1 = mem_test_latency
CFLAGS += -DINST=$(INST) -g
NAME = build-$(NAME_1)-$(INST)

# Give every INST value its own copy of the source file so that the build
# system produces a separately named application per footprint.
$(NAME).c: mem_test_latency.c
	cp mem_test_latency.c $(NAME).c

SRCS = $(NAME).c
include $(AM_HOME)/Makefile.app

# Remove the per-INST source copies created by the rule above.
.PHONY: clean-build
clean-build:
	rm -f build-*
47 changes: 47 additions & 0 deletions apps/mem_test/mem_test_latency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Build bmk

Remember to set `AM_HOME` to the root of this repo.

Clear old build
```
rm -rf $AM_HOME/am/build build
```

Build the latency benchmark with a specific footprint size, e.g. assuming the L3 size is 16 MB:
```
make ARCH=riscv64-xs INST=16
```

Note that `INST` is multiplied by 1 MB to size the memory region that is touched to flush the caches.

# Build DUT with prefetcher off

To avoid prefetching, please remember to turn off prefetchers.

# Run

Use NEMU to check that the build is correct:
```
$NEMU_HOME/build/riscv64-nemu-interpreter -b build/build-mem_test_latency-16-riscv64-xs.bin
```
NEMU will print something like
```
Welcome to riscv64-NEMU!
For help, type "help"
the test of load-add-load hit result
instrs 2015 cycles 2015
```
Note that `cycles` is meaningless in NEMU

Then test it with Xiangshan:
```
/path/to/xiangshan/build/emu -i build/build-mem_test_latency-16-riscv64-xs.bin
```

# Compute latency

Divide the `cycles` value printed by the Xiangshan emu by 1000 to get the load-to-use latency.

Load-to-use latency: https://www.quora.com/What-does-it-mean-by-load-use-penalty-in-computer-architecture-Does-it-have-any-other-name

If prefetchers are turned off, the load-to-use latency is the sum of the L1, L2, L3 and memory latencies.
83 changes: 83 additions & 0 deletions apps/mem_test/mem_test_latency/include/mem_test_latency.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <klib.h>
#include <csr.h>

/* Byte-count units used to express the sizes below. */
#define BYTE (1)
#define KB (1024*BYTE)
#define MB (1024*KB)
#define GB (1024*MB)

/* Start address of the test region. main() passes it to the warm-up as the
 * base address, and cache_iloop() hard-codes the same value (0x80010000) in
 * its asm, so the two must be changed together. */
#define _PERF_TEST_ADDR_BASE 0x80010000
//#define _PERF_TEST_ADDR_BASE 0x100010000
//#define _PERF_TEST_ADDR_BASE 0x80050000
/* Despite the name this is 8 bytes (one 64-bit word). main() uses it as the
 * warm-up step, so every 8-byte slot of the region gets written. */
#define _PERF_CACHELINE_SIZE_BYTE (8 * BYTE)
//#define _PERF_L1_SIZE_BYTE (32 * KB)
//#define _PERF_L1_SIZE_BYTE (33 * KB)
/* NOTE(review): the L1, L3, MEM and TEST_NUM sizes are not referenced by the
 * code in this change; presumably they describe the target's memory
 * hierarchy -- confirm against the DUT configuration. */
#define _PERF_L1_SIZE_BYTE (64 * KB)
/* Added to the base address by main() when computing the end of the region. */
#define _PERF_L2_SIZE_BYTE (1 * MB)
#define _PERF_L3_SIZE_BYTE (6 * MB)
#define _PERF_MEM_SIZE_BYTE (32 *MB)
#define _TEST_NUM_SIZE (32 * KB)
/* Region growth per unit of INST: main() adds INST * _STEP_SIZE bytes. */
#define _STEP_SIZE (1 *MB)

/*
 * Fill the address range [base_addr, end_addr) with 64-bit link words, one
 * every `step` bytes, touching the whole range in the process.
 *
 * The word stored at address A is:
 *   - A itself       when choose == 0 (every slot points back to itself);
 *   - A + 64         otherwise (every slot points 64 bytes ahead).
 * cache_iloop() later follows these links with dependent loads.
 *
 * base_addr, end_addr: raw addresses, used directly as pointers; the caller
 *                      must make sure the range is writable memory.
 * step:                distance between written slots in bytes; must be a
 *                      non-zero multiple of 8 (checked with assert).
 * choose:              0 selects self-links, any other value selects +64 links.
 *
 * Note: with choose != 0 the links stored in the last 64 bytes of the range
 * point at or past end_addr, i.e. at memory this function did not write.
 */
void full_cache_warmup_i(uint64_t base_addr,uint64_t end_addr,uint64_t step,int choose){
    assert(step % 8 ==0);
    assert(step >= 8);

    /* Distance from a slot to the address it links to. */
    const uint64_t link_offset = (choose == 0) ? 0 : 64;

    for (uint64_t cur_addr = base_addr; cur_addr < end_addr; cur_addr += step) {
        *((uint64_t*)cur_addr) = cur_addr + link_offset;
    }
}

/*
 * Time a chain of dependent loads starting at address 0x80010000 (RV64 only).
 *
 * The asm follows the link words previously stored in memory: each loop
 * iteration performs two loads, where the address of every load is the value
 * returned by the previous one. The loop runs while the counter s4 (starting
 * at 0, incremented once per iteration) is <= 500, i.e. 501 iterations.
 *
 * mcycle and minstret are sampled right before jumping into the loop and
 * right after leaving it; the differences are returned through the pointers.
 *
 * instr_count: receives the minstret delta across the measured region.
 * cycle_count: receives the mcycle delta across the measured region.
 *
 * Both output words are written unconditionally by the asm, so the caller
 * does not need to initialise them.
 *
 * noinline keeps a single out-of-line copy so that the 256-byte alignment
 * always applies to the code that is actually measured.
 */
__attribute__((aligned(256), noinline))
void cache_iloop(unsigned long long* instr_count, unsigned long long* cycle_count){
    asm volatile(
        /* Numeric local labels (1 = loop, 2 = init, 3 = term) stay unique
         * even if the compiler emits this asm more than once. */
        "jal zero, 2f;"
        "xor s8 , zero , zero;"   /* jumped over by the jal above */

        "1:"
        "ld s8,0(s6);"            /* s8 = *s6                     */
        "ld s6,0(s8);"            /* s6 = *s8 (depends on s8)     */

        "addi s4 , s4 , 1;"
        "bleu s4,s5,1b;"

        "jal zero ,3f;"

        "2:"
        "li s4 , 0;"              /* iteration counter            */
        "li s5 , 500;"            /* last counter value that loops */
        "li s6 , 0x80010000;"     /* same value as _PERF_TEST_ADDR_BASE */
        "li s7 , 0;"
        "li s8 , 0;"

        "csrr s9 , mcycle;"       /* start samples */
        "csrr s10, minstret;"

        "jal zero, 1b;"
        "3:"
        "csrr s11 , mcycle;"      /* end samples */
        "csrr t3 , minstret;"

        /* Full-width subtraction: the counters are 64-bit, and the 32-bit
         * subw form would truncate and sign-extend the difference. */
        "sub %[c], s11 , s9;"
        "sub %[i], t3 , s10;"

        : [c] "=r" (*cycle_count),[i] "=r" (*instr_count)
        :
        /* "memory": the asm loads from memory that C code wrote earlier, so
         * the compiler must not reorder or cache those stores across it. */
        : "zero","s4","s5","s6","s7","s8","s9","s10","s11","t3","t4","t5","t6","cc","memory"
    );
}
4 changes: 4 additions & 0 deletions apps/mem_test/mem_test_latency/make_bin.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Build one latency benchmark binary for each footprint value INST = 10..19.
# The C-style for loop below is a bash extension, hence the bash shebang.
for ((i = 10; i < 20; i++)); do
    make ARCH=riscv64-xs INST="$i"
done
24 changes: 24 additions & 0 deletions apps/mem_test/mem_test_latency/mem_test_latency.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include <klib.h>
#include "mem_test_latency.h"

/* Two-level stringification: xstr(X) macro-expands X first and str() then
 * turns the result into a string literal. main() uses xstr(INST) to read the
 * build-time INST value as text. */
#define xstr(s) str(s)
#define str(s) #s

/*
 * Benchmark entry point.
 *
 * 1. Reads the build-time INST value (passed with -DINST=<n>).
 * 2. Writes link words over the region
 *      [_PERF_TEST_ADDR_BASE,
 *       _PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + INST * _STEP_SIZE)
 *    in steps of _PERF_CACHELINE_SIZE_BYTE bytes.
 * 3. Runs the dependent-load loop and prints the instruction and cycle
 *    counts it reports.
 *
 * Returns 0.
 */
int main(){
    unsigned long long busy_cycles = 0;
    unsigned long long busy_instrs = 0;

    // INST doubles as the region-size multiplier and as the warm-up mode:
    // 0 selects self-links, any other value selects links 64 bytes ahead.
    uint64_t cache_warmup_size_count = atoi(xstr(INST));

    uint64_t start_addr = _PERF_TEST_ADDR_BASE;
    uint64_t end_addr = (_PERF_TEST_ADDR_BASE + _PERF_L2_SIZE_BYTE + cache_warmup_size_count *_STEP_SIZE);

    // Populate the region with the link words followed by cache_iloop().
    full_cache_warmup_i(start_addr,end_addr,_PERF_CACHELINE_SIZE_BYTE,cache_warmup_size_count);

    // Latency measurement.
    cache_iloop(&busy_instrs,&busy_cycles);

    printf("the test of load-add-load hit result\n");
    // %d expects an int, so convert explicitly instead of passing a
    // 64-bit argument to a mismatching conversion.
    printf("instrs %d cycles %d\n",(int)busy_instrs,(int)busy_cycles);
    return 0;
}