From 3ae3059232aebab7ab84a22d8af600b3dbbed428 Mon Sep 17 00:00:00 2001
From: Yen-Fu Chen
Date: Mon, 22 May 2023 14:22:23 +0800
Subject: [PATCH] Add fuse instruction

To enhance execution efficiency, we employ instruction fusion by combining
sequences that adhere to specific patterns into fused instructions.
Currently, we have incorporated four fused instructions: auipc + addi,
auipc + add, multiple sw, and multiple lw.

A pair is fused only when doing so preserves the architectural state, i.e.
when the second instruction overwrites the destination of the auipc. Runs
of lw/sw are fused when at least five of them are consecutive.
---
 src/decode.h  |  21 ++++-
 src/emulate.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 205 insertions(+), 13 deletions(-)

diff --git a/src/decode.h b/src/decode.h
index ada1ccab6..83fbd0b3e 100644
--- a/src/decode.h
+++ b/src/decode.h
@@ -156,7 +156,12 @@
         _(cjalr, 1)                        \
         _(cadd, 0)                         \
         _(cswsp, 0)                        \
-    )
+    )                                      \
+    _(fuse1, 0)                            \
+    _(fuse2, 0)                            \
+    _(fuse3, 0)                            \
+    _(fuse4, 0)                            \
+    _(empty, 0)
 /* clang-format on */
 
 /* IR list */
@@ -228,6 +233,14 @@ enum {
     INSN_32 = 4,
 };
 
+/* One load/store folded into a fused memory instruction (fuse3/fuse4).
+ * Holds the operands copied from the original lw/sw IR entry.
+ */
+typedef struct mem_fuse {
+    int32_t imm;
+    uint8_t rd, rs1, rs2;
+} mem_fuse_t;
+
 typedef struct rv_insn {
     union {
         int32_t imm;
@@ -240,6 +253,12 @@ typedef struct rv_insn {
 #if RV32_HAS(EXT_C)
     uint8_t shamt;
 #endif
+    /* fuse operation: imm2 is the second immediate for fuse1 and the number
+     * of mem_fuse entries for fuse3/fuse4; mem_fuse is the heap-allocated
+     * operand array used by fuse3/fuse4.
+     */
+    int32_t imm2;
+    mem_fuse_t *mem_fuse;
 
     /* instruction length */
     uint8_t insn_len;
diff --git a/src/emulate.c b/src/emulate.c
index c9015fb58..19cd2d270 100644
--- a/src/emulate.c
+++ b/src/emulate.c
@@ -287,18 +287,18 @@ enum {
 #define RVOP_RUN_NEXT (!ir->tailcall)
 #endif
 
-#define RVOP(inst, code)                                                  \
-    static bool do_##inst(riscv_t *rv UNUSED, const rv_insn_t *ir UNUSED) \
-    {                                                                     \
-        rv->X[rv_reg_zero] = 0;                                           \
-        code;                                                             \
-        rv->csr_cycle++;                                                  \
-    nextop:                                                               \
-        rv->PC += ir->insn_len;                                           \
-        if (!RVOP_RUN_NEXT)                                               \
-            return true;                                                  \
-        const rv_insn_t *next = ir + 1;                                   \
-        MUST_TAIL return next->impl(rv, next);                            \
+#define RVOP(inst, code)                                    \
+    static bool do_##inst(riscv_t *rv, const rv_insn_t *ir) \
+    {                                                       \
+        rv->X[rv_reg_zero] = 0;                             \
+        rv->csr_cycle++;                                    \
+        code;                                               \
+    nextop:                                                 \
+        rv->PC += ir->insn_len;                             \
+        if (!RVOP_RUN_NEXT)                                 \
+            return true;                                    \
+        const rv_insn_t *next = ir + 1;                     \
+        MUST_TAIL return next->impl(rv, next);              \
     }
 
 /* RV32I Base Instruction Set */
@@ -1277,6 +1277,76 @@ RVOP(cswsp, {
 })
 #endif
 
+/* Fused auipc + addi: rd = PC + imm + imm2.
+ * The extra PC bump accounts for the absorbed addi, whose slot now holds an
+ * "empty" instruction that does not advance PC itself.
+ */
+RVOP(fuse1, {
+    rv->X[ir->rd] = (int32_t) (rv->PC + ir->imm + ir->imm2);
+    rv->PC += ir->insn_len;
+})
+
+/* Fused auipc + add: rd = X[rs1] + (PC + imm).
+ * The sum is formed in unsigned arithmetic so that wrap-around is well
+ * defined.
+ */
+RVOP(fuse2, {
+    rv->X[ir->rd] = (uint32_t) rv->X[ir->rs1] + (uint32_t) (rv->PC + ir->imm);
+    rv->PC += ir->insn_len;
+})
+
+/* Fused run of imm2 consecutive sw instructions.
+ * PC is advanced before every element except the first, so it names the
+ * store being executed if the misalignment handler bails out.
+ * NOTE(review): assumes RV_EXC_MISALIGN_HANDLER reports rv->PC; confirm.
+ * The final PC equals the entry PC plus insn_len * (imm2 - 1).
+ */
+RVOP(fuse3, {
+    const mem_fuse_t *mem_fuse = ir->mem_fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        if (i)
+            rv->PC += ir->insn_len;
+        const uint32_t addr = rv->X[mem_fuse[i].rs1] + mem_fuse[i].imm;
+        RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
+        rv->io.mem_write_w(rv, addr, rv->X[mem_fuse[i].rs2]);
+    }
+})
+
+/* Fused run of imm2 consecutive lw instructions.
+ * x0 is cleared before each element because an earlier load of the run may
+ * have targeted it while a later one uses it as its base register. PC is
+ * advanced per element for the same reason as in fuse3.
+ */
+RVOP(fuse4, {
+    const mem_fuse_t *mem_fuse = ir->mem_fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        if (i)
+            rv->PC += ir->insn_len;
+        rv->X[rv_reg_zero] = 0;
+        const uint32_t addr = rv->X[mem_fuse[i].rs1] + mem_fuse[i].imm;
+        RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
+        rv->X[mem_fuse[i].rd] = rv->io.mem_read_w(rv, addr);
+    }
+})
+
+/* Placeholder occupying the slot of an instruction absorbed by a fused
+ * instruction. It only accounts for the cycle; the fused instruction has
+ * already advanced PC on its behalf.
+ */
+static bool do_empty(riscv_t *rv, const rv_insn_t *ir)
+{
+    rv->X[rv_reg_zero] = 0;
+    rv->csr_cycle++;
+    /* An absorbed instruction can be the last one of its block (e.g. the
+     * addi of an auipc + addi pair), so honor RVOP_RUN_NEXT instead of
+     * dispatching past the end of the IR array.
+     */
+    if (!RVOP_RUN_NEXT)
+        return true;
+    const rv_insn_t *next = ir + 1;
+    MUST_TAIL return next->impl(rv, next);
+}
+
 static const void *dispatch_table[] = {
 #define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
     RISCV_INSN_LIST
@@ -1407,6 +1477,106 @@ static void extend_block(riscv_t *rv, block_t *block)
     last_ir->branch_untaken = next->ir;
 }
 
+/* Minimum number of consecutive lw/sw instructions worth fusing. */
+enum { MEM_FUSE_MIN_RUN = 5 };
+
+/* Try to fuse the run of consecutive @opcode instructions (lw or sw) that
+ * starts at @ir into one @fused instruction.
+ *
+ * @limit bounds how many IR entries, counted from @ir, may be inspected.
+ * Runs shorter than MEM_FUSE_MIN_RUN are left untouched, and so is any run
+ * whose operand array cannot be allocated. Returns the length of the run
+ * that was scanned (at least 1) so the caller can skip over it.
+ *
+ * NOTE(review): this patch never frees ir->mem_fuse; release it wherever
+ * the block's IR array is released.
+ */
+static uint32_t fuse_mem_run(rv_insn_t *ir,
+                             uint32_t limit,
+                             int opcode,
+                             int fused)
+{
+    uint32_t count = 1;
+    while (count < limit && ir[count].opcode == opcode)
+        count++;
+    if (count < MEM_FUSE_MIN_RUN)
+        return count;
+
+    mem_fuse_t *ops = malloc(count * sizeof *ops);
+    if (!ops)
+        return count; /* out of memory: keep the instructions unfused */
+
+    for (uint32_t k = 0; k < count; k++) {
+        ops[k].imm = ir[k].imm;
+        ops[k].rd = ir[k].rd;
+        ops[k].rs1 = ir[k].rs1;
+        ops[k].rs2 = ir[k].rs2;
+        if (k) {
+            /* absorbed: replaced by a placeholder */
+            ir[k].opcode = rv_insn_empty;
+            ir[k].impl = dispatch_table[rv_insn_empty];
+        }
+    }
+    ir->opcode = fused;
+    ir->imm2 = (int32_t) count;
+    ir->mem_fuse = ops;
+    ir->impl = dispatch_table[fused];
+    return count;
+}
+
+/* Rewrite fusible instruction sequences of a freshly translated block:
+ * auipc + addi (fuse1), auipc + add (fuse2), and runs of at least
+ * MEM_FUSE_MIN_RUN sw (fuse3) or lw (fuse4). Absorbed instructions become
+ * rv_insn_empty placeholders so the IR layout and indices stay unchanged.
+ */
+static void match_pattern(block_t *block)
+{
+    /* "i + 1 < n_insn" avoids evaluating n_insn - 1, which would wrap around
+     * for an empty block if n_insn is unsigned.
+     */
+    for (uint32_t i = 0; i + 1 < block->n_insn; i++) {
+        rv_insn_t *ir = block->ir + i;
+        rv_insn_t *next_ir = ir + 1;
+        switch (ir->opcode) {
+        case rv_insn_auipc:
+            /* The fused forms write one register only, so the pair is
+             * equivalent just when the second instruction overwrites the
+             * auipc destination.
+             */
+            if (next_ir->rd != ir->rd)
+                break;
+            if (next_ir->opcode == rv_insn_addi && next_ir->rs1 == ir->rd) {
+                ir->opcode = rv_insn_fuse1;
+                ir->imm2 = next_ir->imm;
+            } else if (next_ir->opcode == rv_insn_add &&
+                       next_ir->rs2 == ir->rd && next_ir->rs1 != ir->rd) {
+                /* fuse2 reads rs1 as it was before the auipc, hence the
+                 * rs1 != rd requirement.
+                 */
+                ir->opcode = rv_insn_fuse2;
+                ir->rs1 = next_ir->rs1;
+            } else {
+                break;
+            }
+            ir->impl = dispatch_table[ir->opcode];
+            next_ir->opcode = rv_insn_empty;
+            next_ir->impl = dispatch_table[rv_insn_empty];
+            break;
+        case rv_insn_sw:
+            /* the block's final instruction is never absorbed into a run */
+            i += fuse_mem_run(ir, block->n_insn - 1 - i, rv_insn_sw,
+                              rv_insn_fuse3) - 1;
+            break;
+        case rv_insn_lw:
+            i += fuse_mem_run(ir, block->n_insn - 1 - i, rv_insn_lw,
+                              rv_insn_fuse4) - 1;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
 static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
 {
     block_map_t *map = &rv->block_map;
@@ -1425,6 +1595,9 @@ static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
         /* translate the basic block */
         block_translate(rv, next);
 
+        /* fuse instruction */
+        match_pattern(next);
+
         /* insert the block into block map */
         block_insert(&rv->block_map, next);
 