OpenXiangShan · shinezyy · Dec 4, 2024 · Dec 4, 2024
diff --git a/apps/mem_test/mem_test_latency/Makefile b/apps/mem_test/mem_test_latency/Makefile
@@ -0,0 +1,9 @@
+NAME_1 = mem_test_latency
+CFLAGS += -DINST=$(INST) -g
+NAME = build-$(NAME_1)-$(INST)
+$(NAME).c:mem_test_latency.c # the source is copied to a name that contains INST, so every INST value gets its own build-...-<INST> name
+	cp mem_test_latency.c $(NAME).c
+SRCS = $(NAME).c
+include $(AM_HOME)/Makefile.app # NOTE(review): presumably supplies the real build rules driven by NAME/SRCS; not visible here, confirm
+clean-build: # deletes the files in this directory whose names start with build- (NOTE(review): not declared .PHONY)
+	rm -f build-*
diff --git a/apps/mem_test/mem_test_latency/README.md b/apps/mem_test/mem_test_latency/README.md
@@ -0,0 +1,47 @@
+# Build bmk
+
+Remember to set `AM_HOME` to the root of this repo.
+
+Clear the old build:
+```
+rm -rf $AM_HOME/am/build build
+```
+
+Build the latency benchmark with a specific footprint size, for example when the L3 size is 16 MB:
+```
+make ARCH=riscv64-xs INST=16
+```
+
+Note that `INST` is multiplied by 1 MB to size the memory region that is written to flush the caches.
+
+# Build DUT with prefetcher off
+
+To avoid prefetching, please remember to turn off prefetchers.
+
+# Run
+
+First run it on NEMU to check that the build is correct:
+```
+$NEMU_HOME/build/riscv64-nemu-interpreter -b build/build-mem_test_latency-16-riscv64-xs.bin
+```
+NEMU will print something like:
+```
+Welcome to riscv64-NEMU!
+For help, type "help"
+the test of load-add-load hit result
+instrs 2015 cycles 2015
+```
+Note that `cycles` is meaningless in NEMU
+
+Then test it with Xiangshan:
+```
+/path/to/xiangshan/build/emu -i build/build-mem_test_latency-16-riscv64-xs.bin
+```
+
+# Compute latency
+
+Divide the `cycles` value printed by the Xiangshan emu by 1000 (the timed loop performs about 1000 dependent loads) to get the load-to-use latency.
+
+Load-to-use latency: https://www.quora.com/What-does-it-mean-by-load-use-penalty-in-computer-architecture-Does-it-have-any-other-name
+
+If prefetchers are turned off, the load-to-use latency consists of the L1 + L2 + L3 + memory latency.
diff --git a/apps/mem_test/mem_test_latency/include/mem_test_latency.h b/apps/mem_test/mem_test_latency/include/mem_test_latency.h
@@ -0,0 +1,83 @@
+#include <klib.h>
+#include <csr.h>
+
+#define BYTE (1) // the size units below are expressed in bytes
+#define KB (1024*BYTE)
+#define MB (1024*KB)
+#define GB (1024*MB)
+// Start address of the test region; the asm loop in cache_iloop hard-codes the same value (0x80010000).
+#define _PERF_TEST_ADDR_BASE 0x80010000
+// Disabled alternative base address: 0x100010000
+// Disabled alternative base address: 0x80050000
+#define _PERF_CACHELINE_SIZE_BYTE (8 * BYTE) // NOTE(review): 8 bytes despite the name; main passes it as the warmup store stride
+// Disabled alternative L1 size: (32 * KB)
+// Disabled alternative L1 size: (33 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
+#define _PERF_L2_SIZE_BYTE (1 * MB) // main adds this to the base address when computing the end of the warmup region
+#define _PERF_L3_SIZE_BYTE (6 * MB)
+#define _PERF_MEM_SIZE_BYTE (32 *MB)
+#define _TEST_NUM_SIZE (32 * KB)
+#define _STEP_SIZE (1 *MB) // main multiplies this by INST to extend the warmup region
+
+// Writes a pointer chain into [base_addr, end_addr): one 8-byte store every `step` bytes.
+//   choose == 0: each slot holds its own address, so chasing the chain stays on that slot;
+//   choose != 0: each slot holds its address + 64, so chasing the chain moves 64 bytes ahead per load.
+// step must be a multiple of 8 and at least 8 (both asserted). base_addr is cast to uint64_t* as is:
+// NOTE(review): alignment and writability of the range are the caller's responsibility, nothing checks them.
+// Returns nothing; the only effect is the stores into the given range.
+void full_cache_warmup_i(uint64_t base_addr,uint64_t end_addr,uint64_t step,int choose){
+    assert(step % 8 ==0);
+    assert(step >= 8);
+    // The stored value is "address + constant"; the constant depends only on choose, so pick it once.
+    const uint64_t link_offset = (choose == 0) ? 0 : 64;
+
+    for(uint64_t cur_addr = base_addr; cur_addr < end_addr; cur_addr += step){
+        // volatile: the chain is read back by inline asm, which the compiler cannot analyse, so these
+        // stores must really be performed and stay ordered with that asm volatile statement.
+        *((volatile uint64_t*)cur_addr) = cur_addr + link_offset;
+    }
+}
+
+// Timed pointer-chasing loop for the latency test. Starting at 0x80010000 it runs 501 iterations of two dependent
+// loads (1002 loads), then stores the 32-bit mcycle/minstret deltas (subw) to *cycle_count and *instr_count.
+__attribute__((aligned(256)))
+void cache_iloop(unsigned long long* instr_count, unsigned long long* cycle_count){
+    *instr_count = 0;
+    *cycle_count = 0;
+    asm volatile(
+            "jal zero, 2f;"
+            "xor s8  , zero , zero;"   // jumped over, never executed; kept so the code layout is unchanged
+
+        // Local numeric labels: 1 = loop, 2 = init, 3 = term. Named global labels would be defined twice
+        // (assembler error) if the compiler ever emitted this asm statement more than once.
+        "1:"
+            "ld    s8,0(s6);"          // s8 = *s6
+            "ld    s6,0(s8);"          // s6 = *s8: each load needs the result of the previous one
+            "addi s4 , s4 , 1;"
+            "bleu s4,s5,1b;"           // repeat while the incremented counter is <= 500
+            "jal  zero ,3f;"
+
+        "2:"
+            "li   s4 , 0;"             // iteration counter
+            "li   s5 , 500;"           // loop bound
+            "li   s6 , 0x80010000;"    // chain start; same value as _PERF_TEST_ADDR_BASE
+            "li   s7 , 0;"
+            "li   s8 , 0;"
+            // Take the start samples last so that the setup above is not part of the measurement.
+            "csrr  s9 , mcycle;"
+            "csrr  s10, minstret;"
+            "jal   zero, 1b;"
+
+        "3:"
+            "csrr s11 , mcycle;"
+            "csrr t3  , minstret;"
+            "subw  %[c], s11 , s9;"
+            "subw  %[i], t3  , s10;"
+
+        : [c] "=r" (*cycle_count),[i] "=r" (*instr_count)
+        : 
+        // "memory": the loads read data written by ordinary C stores, so the compiler must not cache
+        // or move memory accesses across this statement.
+        : "zero","s4","s5","s6","s7","s8","s9","s10","s11","t3","t4","t5","t6","cc","memory"
+    );
+}
diff --git a/apps/mem_test/mem_test_latency/make_bin.sh b/apps/mem_test/mem_test_latency/make_bin.sh
@@ -0,0 +1,4 @@
+# Build one benchmark binary per footprint setting, INST = 10..19 (plain list: works in any POSIX sh, not only bash).
+for i in 10 11 12 13 14 15 16 17 18 19; do
+    make ARCH=riscv64-xs INST="$i"
+done
diff --git a/apps/mem_test/mem_test_latency/mem_test_latency.c b/apps/mem_test/mem_test_latency/mem_test_latency.c
@@ -0,0 +1,24 @@
+#include <klib.h>
+#include "mem_test_latency.h"
+
+#define xstr(s) str(s) // expands the macro argument first, then hands the result to str()
+#define str(s)  #s // turns its argument into a string literal
+
+// Prepares the pointer chain in memory, runs the timed loop and prints the instruction/cycle counts.
+int main(){
+    unsigned long long busy_cycles;
+    unsigned long long busy_instrs;
+    // INST is supplied by the Makefile (-DINST=...): number of _STEP_SIZE units written on top of the L2 size.
+    uint64_t cache_warmup_size_count = atoi(xstr(INST));
+    uint64_t start_addr = _PERF_TEST_ADDR_BASE;
+    // Derive the end from start_addr so the whole sum is done in 64-bit arithmetic.
+    uint64_t end_addr = start_addr + _PERF_L2_SIZE_BYTE + cache_warmup_size_count *_STEP_SIZE;
+    // One store every _PERF_CACHELINE_SIZE_BYTE bytes; a non-zero count selects the "address + 64" chain.
+    full_cache_warmup_i(start_addr,end_addr,_PERF_CACHELINE_SIZE_BYTE,cache_warmup_size_count);
+    cache_iloop(&busy_instrs,&busy_cycles);
+
+    printf("the test of load-add-load hit result\n");
+    // The counters are 32-bit deltas (subw in cache_iloop); cast so the arguments really match %d.
+    printf("instrs %d cycles %d\n",(int)busy_instrs,(int)busy_cycles);
+    return 0;
+}