Skip to content

Commit

Permalink
Introduce macro operation fusion
Browse files Browse the repository at this point in the history
Through our observations, we have identified certain patterns in instruction
sequences. By converting these specific RISC-V instruction patterns into
faster and equivalent code, we can significantly improve execution efficiency.

In our current analysis, we focus on commonly used benchmarks and have
found the following frequently occurring instruction patterns: auipc + addi,
auipc + add, consecutive sw, and consecutive lw.

|  Metric  |     commit fba5802       |    macro fuse operation   |Speedup|
|----------+--------------------------+---------------------------+-------|
| CoreMark | 1351.065 (Iterations/Sec)|  1352.843 (Iterations/Sec)|+0.13% |
| dhrystone|       1073 DMIPS         |        1146 DMIPS         | +6.8% |
| nqueens  |       8295 msec          |        7824 msec          | +6.0% |
  • Loading branch information
qwe661234 committed May 27, 2023
1 parent fba5802 commit 3f84ce5
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 2 deletions.
18 changes: 17 additions & 1 deletion src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,15 @@
_(cjalr, 1) \
_(cadd, 0) \
_(cswsp, 0) \
)
) \
/* macro operation fusion: convert specific RISC-V instruction patterns
* into faster and equivalent code
*/ \
_(fuse1, 0) \
_(fuse2, 0) \
_(fuse3, 0) \
_(fuse4, 0) \
_(empty, 0)
/* clang-format on */

/* IR list */
Expand Down Expand Up @@ -228,6 +236,11 @@ enum {
INSN_32 = 4,
};

/* Operand record for one memory instruction that has been folded into a
 * fused load/store operation. The fused handlers form the access address
 * from the base register plus the immediate.
 */
typedef struct {
    int32_t imm; /* immediate offset added to the base register */
    uint8_t rd;  /* destination register index (read by the fused load op) */
    uint8_t rs1; /* base register index */
    uint8_t rs2; /* register holding the value written by the fused store op */
} opcode_fuse_t;

typedef struct rv_insn {
union {
int32_t imm;
Expand All @@ -240,6 +253,9 @@ typedef struct rv_insn {
#if RV32_HAS(EXT_C)
uint8_t shamt;
#endif
/* fuse operation */
int32_t imm2;
opcode_fuse_t *fuse;

/* instruction length */
uint8_t insn_len;
Expand Down
131 changes: 130 additions & 1 deletion src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ extern struct target_ops gdbstub_ops;
#include "decode.h"
#include "riscv.h"
#include "riscv_private.h"
#include "state.h"
#include "utils.h"

/* RISC-V exception code list */
Expand Down Expand Up @@ -1219,6 +1220,60 @@ RVOP(cswsp, {
})
#endif

/* fuse1: auipc immediately followed by addi, executed as one operation.
 *
 * ir->imm is the auipc immediate and ir->imm2 the addi immediate; the
 * destination receives PC + imm + imm2. Only X[ir->rd] is written, so the
 * pass that creates this op must make sure the auipc's own destination does
 * not need to survive as a separate value.
 */
RVOP(fuse1, {
    rv->X[ir->rd] = (int32_t) ((rv->PC + ir->imm) + ir->imm2);
    /* skip the absorbed addi.
     * NOTE(review): the RVOP wrapper presumably advances past the auipc
     * itself -- confirm against the macro definition. */
    rv->PC = rv->PC + ir->insn_len;
})

/* fuse2: auipc immediately followed by add, executed as one operation.
 *
 * ir->imm is the auipc immediate and ir->rs1 the add operand that is NOT
 * the auipc result; the destination receives X[rs1] + (PC + imm).
 *
 * The sum is formed in uint32_t: address arithmetic is expected to wrap
 * modulo 2^32, whereas adding two int32_t values is undefined behaviour on
 * overflow.
 */
RVOP(fuse2, {
    const uint32_t operand = (uint32_t) rv->X[ir->rs1];
    const uint32_t pc_relative = (uint32_t) (rv->PC + ir->imm);
    rv->X[ir->rd] = (int32_t) (operand + pc_relative);
    /* skip the absorbed add.
     * NOTE(review): the RVOP wrapper presumably advances past the auipc
     * itself -- confirm against the macro definition. */
    rv->PC += ir->insn_len;
})

/* fuse3: a run of sw instructions executed as one operation.
 *
 * ir->fuse holds one operand record per original sw and ir->imm2 is the
 * number of records. The stores are issued in their original order.
 */
RVOP(fuse3, {
    const opcode_fuse_t *ops = ir->fuse;
    const int32_t n_ops = ir->imm2;
    /* The run was fused because its addresses are contiguous, so checking
     * the first store is enough to tell whether the addresses are misaligned
     * or the memory chunk does not exist.
     * NOTE(review): RV_EXC_MISALIGN_HANDLER is defined elsewhere and appears
     * to pick up the local named 'addr' -- keep that name. */
    uint32_t addr = rv->X[ops[0].rs1] + ops[0].imm;
    RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
    rv->io.mem_write_w(rv, addr, rv->X[ops[0].rs2]);
    int k = 1;
    while (k < n_ops) {
        addr = rv->X[ops[k].rs1] + ops[k].imm;
        rv->io.mem_write_w(rv, addr, rv->X[ops[k].rs2]);
        k++;
    }
    /* skip the absorbed stores; the first one is not counted here */
    rv->PC += ir->insn_len * (n_ops - 1);
})

/* fuse4: a run of lw instructions executed as one operation.
 *
 * ir->fuse holds one operand record per original lw and ir->imm2 is the
 * number of records. The loads are issued in their original order, each one
 * reading its base register at the time it executes.
 */
RVOP(fuse4, {
    const opcode_fuse_t *ops = ir->fuse;
    const int32_t n_ops = ir->imm2;
    /* The run was fused because its addresses are contiguous, so checking
     * the first load is enough to tell whether the addresses are misaligned
     * or the memory chunk does not exist.
     * NOTE(review): RV_EXC_MISALIGN_HANDLER is defined elsewhere and appears
     * to pick up the local named 'addr' -- keep that name. */
    uint32_t addr = rv->X[ops[0].rs1] + ops[0].imm;
    RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
    rv->X[ops[0].rd] = rv->io.mem_read_w(rv, addr);
    int k = 1;
    while (k < n_ops) {
        addr = rv->X[ops[k].rs1] + ops[k].imm;
        rv->X[ops[k].rd] = rv->io.mem_read_w(rv, addr);
        k++;
    }
    /* skip the absorbed loads; the first one is not counted here */
    rv->PC += ir->insn_len * (n_ops - 1);
})

/* Handler for an instruction slot whose work was absorbed into a preceding
 * fused operation. It does no architectural work of its own (the PC is not
 * touched here); it clears x0, counts a cycle and hands control to the next
 * instruction of the block.
 *
 * The last instruction of a block is marked with 'tailcall'. If an absorbed
 * instruction happens to sit in that slot there is no 'ir + 1' to chain to,
 * so stop there instead of reading past the end of the block's IR array.
 *
 * NOTE(review): 'true' is assumed to mean "completed without raising an
 * exception", as for the other instruction handlers -- confirm against the
 * RVOP wrapper.
 */
static bool do_empty(riscv_t *rv, const rv_insn_t *ir)
{
    rv->X[rv_reg_zero] = 0;
    rv->csr_cycle++;

    /* end of block: nothing to chain to */
    if (ir->tailcall)
        return true;

    const rv_insn_t *next = ir + 1;
    MUST_TAIL return next->impl(rv, next);
}

static const void *dispatch_table[] = {
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
RISCV_INSN_LIST
Expand Down Expand Up @@ -1337,7 +1392,6 @@ static void block_translate(riscv_t *rv, block_t *block)
/* compute the end of pc */
block->pc_end += ir->insn_len;
block->n_insn++;

/* stop on branch */
if (insn_is_branch(ir->opcode)) {
/* recursive jump translation */
Expand All @@ -1356,6 +1410,78 @@ static void block_translate(riscv_t *rv, block_t *block)
block->ir[block->n_insn - 1].tailcall = true;
}

/* Try to fuse a run of lw (RW = 1) or sw (RW = 0) instructions starting at
 * 'ir' into a single fuse4 / fuse3 operation.
 *
 * This is a statement macro meant to be expanded directly inside a 'case' of
 * the switch in match_pattern(): it uses the enclosing locals 'block', 'i',
 * 'ir', 'next_ir', 'count' and 'sign', and every path through it ends in
 * 'break', which leaves that switch. RW must be the literal 0 or 1.
 *
 * The run is extended while each following instruction
 *   - has the same opcode and the same base register (rs1) as the first, and
 *   - has an immediate exactly 4 * k bytes away from the first one, where k
 *     is its distance from 'ir', always in the direction set by the first
 *     pair, i.e. the accessed words are contiguous.
 * For loads the run also stops once the previous load has overwritten the
 * base register, since the later addresses would no longer be contiguous
 * with the first one. The last instruction of the block is never absorbed.
 *
 * On success the first instruction becomes the fused op: ir->fuse points to
 * a malloc'ed array of 'count' operand records, ir->imm2 holds 'count', and
 * the absorbed instructions are rewritten to rv_insn_empty. If the
 * allocation fails, the instructions are left unfused.
 */
#define pack_memory_operation(RW)                                          \
    count = 1;                                                             \
    next_ir = ir + 1;                                                      \
    if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw))                \
        break;                                                             \
    sign = ir->imm < next_ir->imm ? -1 : 1;                                \
    for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {                 \
        next_ir = ir + j;                                                  \
        if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) ||          \
            ir->rs1 != next_ir->rs1 ||                                     \
            ir->imm - next_ir->imm != 4 * sign * (int32_t) j ||            \
            ((RW) && (next_ir - 1)->rd == ir->rs1))                        \
            break;                                                         \
        count++;                                                           \
    }                                                                      \
    if (count > 1) {                                                       \
        opcode_fuse_t *fused = malloc(count * sizeof(*fused));             \
        if (fused) {                                                       \
            for (int32_t k = 0; k < count; k++) {                          \
                next_ir = ir + k;                                          \
                fused[k].imm = next_ir->imm;                               \
                fused[k].rd = next_ir->rd;                                 \
                fused[k].rs1 = next_ir->rs1;                               \
                fused[k].rs2 = next_ir->rs2;                               \
                if (k > 0) {                                               \
                    next_ir->opcode = rv_insn_empty;                       \
                    next_ir->impl = dispatch_table[next_ir->opcode];       \
                }                                                          \
            }                                                              \
            ir->opcode = IIF(RW)(rv_insn_fuse4, rv_insn_fuse3);            \
            ir->fuse = fused;                                              \
            ir->imm2 = count;                                              \
            ir->impl = dispatch_table[ir->opcode];                         \
        }                                                                  \
    }                                                                      \
    break;


/* Scan a translated block for instruction sequences that can be replaced by
 * a fused operation, and rewrite them in place.
 *
 * Recognised patterns:
 *   - auipc rd, imm ; addi rd, rd, imm2        -> fuse1
 *   - auipc rd, imm ; add  rd, rs1, rd         -> fuse2 (rs1 != rd)
 *   - a run of sw / lw on contiguous addresses -> fuse3 / fuse4
 *
 * The fused auipc handlers write a single destination register, so an auipc
 * pair is fused only when the second instruction overwrites the auipc's own
 * destination; otherwise the intermediate PC-relative value would be lost.
 * For add, the other operand must also not be that register, because the
 * fused handler reads it before the auipc result exists.
 *
 * An absorbed instruction keeps its slot in block->ir but becomes
 * rv_insn_empty.
 */
static void match_pattern(block_t *block)
{
    /* a pattern needs at least two instructions; this also keeps
     * 'n_insn - 1' below from wrapping around for an empty block */
    if (block->n_insn < 2)
        return;

    for (uint32_t i = 0; i < block->n_insn - 1; i++) {
        rv_insn_t *ir = block->ir + i, *next_ir = NULL;
        /* 'count' and 'sign' are scratch variables for
         * pack_memory_operation() */
        int32_t count = 0, sign = 1;
        switch (ir->opcode) {
        case rv_insn_auipc:
            next_ir = ir + 1;
            /* the auipc result must be dead after the pair */
            if (next_ir->rd != ir->rd)
                break;
            if (next_ir->opcode == rv_insn_addi && next_ir->rs1 == ir->rd) {
                /* the destination register of instruction auipc is equal to
                 * the source register 1 of next instruction addi */
                ir->opcode = rv_insn_fuse1;
                ir->imm2 = next_ir->imm;
            } else if (next_ir->opcode == rv_insn_add &&
                       next_ir->rs2 == ir->rd && next_ir->rs1 != ir->rd) {
                /* the destination register of instruction auipc is equal to
                 * the source register 2 of next instruction add */
                ir->opcode = rv_insn_fuse2;
                ir->rs1 = next_ir->rs1;
            } else {
                break;
            }
            ir->impl = dispatch_table[ir->opcode];
            next_ir->opcode = rv_insn_empty;
            next_ir->impl = dispatch_table[next_ir->opcode];
            break;
        case rv_insn_sw:
            /* If the memory addresses of a sequence of store instructions for
             * data are contiguous, pack these instructions. The macro
             * expansion ends in 'break', so there is no fallthrough. */
            pack_memory_operation(0);
        case rv_insn_lw:
            /* If the memory addresses of a sequence of load instructions for
             * data are contiguous, pack these instructions. The macro
             * expansion ends in 'break', so there is no fallthrough. */
            pack_memory_operation(1);
        default:
            break;
        }
    }
}

static block_t *prev = NULL;
static block_t *block_find_or_translate(riscv_t *rv)
{
Expand All @@ -1375,6 +1501,9 @@ static block_t *block_find_or_translate(riscv_t *rv)
/* translate the basic block */
block_translate(rv, next);

/* macro operation fusion */
match_pattern(next);

/* insert the block into block map */
block_insert(&rv->block_map, next);

Expand Down
4 changes: 4 additions & 0 deletions src/riscv.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ void block_map_clear(block_map_t *map)
block_t *block = map->map[i];
if (!block)
continue;
for (uint32_t i = 0; i < block->n_insn; i++) {
if (block->ir[i].fuse)
free(block->ir[i].fuse);
}
free(block->ir);
free(block);
map->map[i] = NULL;
Expand Down

0 comments on commit 3f84ce5

Please sign in to comment.