diff --git a/src/dynarec/rv64/dynarec_rv64_0f_vector.c b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
index 268023c973..e01ed34bae 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f_vector.c
@@ -104,7 +104,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 q0 = fpu_get_scratch(dyn);
                 VSLIDEDOWN_VI(q0, v1, 1, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, q0); // implies VMASK
                 } else {
                     VMV_X_S(x4, q0);
@@ -116,7 +116,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1); // unaligned!
                 GETGX_vector(v0, 1, VECTOR_SEW8);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
             }
             break;
@@ -140,7 +140,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 v1 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VSLIDEUP_VI(v0, v1, 8, VECTOR_UNMASKED);
             }
@@ -156,7 +156,7 @@ uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v1, v1, q0); // implies VMASK
                 } else {
                     VMV_X_S(x4, q0);
diff --git a/src/dynarec/rv64/dynarec_rv64_660f_vector.c b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
index b27f4135fb..3327adf874 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f_vector.c
@@ -99,7 +99,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -117,7 +117,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v1, q0); // implies VMASK
                 } else {
                     if (v0 != v1) { VMV_V_V(v0, v1); }
@@ -126,7 +126,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             } else {
                 q0 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
@@ -197,7 +197,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VMV_V_V(v0, q0);
             if (q1 & 1) VMV_V_V(d1, q1);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -216,7 +215,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VMV_V_V(v0, q0);
             if (q1 & 1) VMV_V_V(d1, q1);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW32, VECTOR_LMUL2, 2);
@@ -236,7 +234,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VMV_V_V(v0, q0);
             if (q1 & 1) VMV_V_V(d1, q1);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -257,7 +254,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
             VWMULSU_VV(v0, q1, q0, VECTOR_UNMASKED);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
-            vector_loadmask(dyn, ninst, VMASK, 0b0101010101010101, x4, 2);
+            VECTOR_LOAD_VMASK(0b0101010101010101, x4, 2);
             VCOMPRESS_VM(d0, v0, VMASK);
             VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
             VCOMPRESS_VM(d1, v0, VMASK);
@@ -273,7 +270,6 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VMV_V_V(v0, q0);
             if (q1 & 1) VMV_V_V(d1, q1);
             vector_vsetvli(dyn, ninst, x1, VECTOR_SEW16, VECTOR_LMUL2, 2);
@@ -671,7 +667,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETGX_vector(q0, 1, VECTOR_SEW16);
             GETEX_vector(q1, 0, 0, VECTOR_SEW16);
             u8 = F8;
-            vector_loadmask(dyn, ninst, VMASK, u8, x4, 1);
+            VECTOR_LOAD_VMASK(u8, x4, 1);
             VADD_VI(q0, q1, 0, VECTOR_MASKED);
             break;
         case 0x0F:
@@ -911,7 +907,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLBW Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010101010101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010101010101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW8);
@@ -926,7 +922,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLWD Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b10101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b10101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 3 3 2 2 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW16);
@@ -941,7 +937,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKLDQ Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED); // v0 = 1 1 0 0
             GETGX_vector(q0, 1, VECTOR_SEW32);
@@ -1021,7 +1017,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKHBW Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010101010101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010101010101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
             VADD_VI(v0, v0, 8, VECTOR_UNMASKED); // v0 = 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8
@@ -1029,7 +1025,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKHWD Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW16, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b10101010, x1, 1);
+            VECTOR_LOAD_VMASK(0b10101010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
             VADD_VI(v0, v0, 4, VECTOR_UNMASKED); // v0 = 7 7 6 6 5 5 4 4
@@ -1037,7 +1033,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             INST_NAME("PUNPCKHDQ Gx, Ex");
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
-            vector_loadmask(dyn, ninst, VMASK, 0b1010, x1, 1);
+            VECTOR_LOAD_VMASK(0b1010, x1, 1);
             v0 = fpu_get_scratch(dyn);
             VIOTA_M(v0, VMASK, VECTOR_UNMASKED);
             VADD_VI(v0, v0, 2, VECTOR_UNMASKED); // v0 = 3 3 2 2
@@ -1087,7 +1083,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             } else {
                 q0 = fpu_get_scratch(dyn);
                 VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 VLUXEI64_V(v0, q0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
@@ -1105,7 +1101,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 q0 = fpu_get_scratch(dyn);
                 VSLIDE1DOWN_VX(q0, v0, xZR, VECTOR_UNMASKED);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v1, q0); // implies VMASK
                 } else {
                     if (v0 != v1) { VMV_V_V(v0, v1); }
@@ -1114,7 +1110,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 }
             } else {
                 q0 = fpu_get_scratch(dyn);
-                vector_loadmask(dyn, ninst, VMASK, 0b10, x1, 1);
+                VECTOR_LOAD_VMASK(0b10, x1, 1);
                 VSLIDE1DOWN_VX(v0, v0, xZR, VECTOR_UNMASKED);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
@@ -1132,7 +1128,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 SET_ELEMENT_WIDTH(x3, VECTOR_SEW32, 1);
             }
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
-            vector_loadmask(dyn, ninst, VMASK, 1, x4, 1);
+            VECTOR_LOAD_VMASK(1, x4, 1);
             VMERGE_VXM(v0, v0, ed);
             break;
         case 0x6F:
@@ -1389,6 +1385,37 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VMERGE_VIM(q0, q0, 1); // implies vmask and widened it
             VRSUB_VX(q0, q0, xZR, VECTOR_UNMASKED);
             break;
+        case 0x7C:
+            INST_NAME("HADDPD Gx, Ex");
+            nextop = F8;
+            SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
+            GETGX_vector(q0, 1, VECTOR_SEW64);
+            GETEX_vector(q1, 0, 0, VECTOR_SEW64);
+            v0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            v1 = fpu_get_scratch(dyn);
+            d1 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
+            d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2); // no more scratches!
+            VMV_V_V(v0, q0);
+            if (q1 & 1) VMV_V_V(d1, q1);
+            VMV_V_I(VMASK, rv64_xtheadvector ? 1 : 0b0101);
+            vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL2, 2);
+            VSLIDEUP_VI(v0, (q1 & 1) ? d1 : q1, 2, VECTOR_UNMASKED);
+            VCOMPRESS_VM(d0, v0, VMASK);
+            VXOR_VI(VMASK, VMASK, 0x1F, VECTOR_UNMASKED);
+            VCOMPRESS_VM(d1, v0, VMASK);
+            vector_vsetvli(dyn, ninst, x1, VECTOR_SEW64, VECTOR_LMUL1, 1);
+            if (!box64_dynarec_fastnan) {
+                VMFEQ_VV(v0, d0, d0, VECTOR_UNMASKED);
+                VMFEQ_VV(v1, d1, d1, VECTOR_UNMASKED);
+                VMAND_MM(v0, v0, v1);
+            }
+            VFADD_VV(q0, d0, d1, VECTOR_UNMASKED);
+            if (!box64_dynarec_fastnan) {
+                VMFEQ_VV(v1, q0, q0, VECTOR_UNMASKED);
+                VMANDN_MM(VMASK, v0, v1);
+                VFSGNJN_VV(q0, q0, q0, VECTOR_MASKED);
+            }
+            break;
         case 0x7E:
             return 0;
         case 0x7F:
@@ -1424,7 +1451,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LHU(x4, ed, fixedaddress);
                 ed = x4;
             }
-            vector_loadmask(dyn, ninst, VMASK, (1 << u8), x5, 1);
+            VECTOR_LOAD_VMASK((1 << u8), x5, 1);
             v0 = fpu_get_scratch(dyn);
             VMERGE_VXM(v0, q0, ed); // uses VMASK
             VMV_V_V(q0, v0);
@@ -1470,7 +1497,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+                VECTOR_LOAD_VMASK(1, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 q1 = fpu_get_scratch(dyn);
@@ -1624,7 +1651,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             GETGX_vector(q0, 1, VECTOR_SEW64);
-            vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+            VECTOR_LOAD_VMASK(1, x1, 1);
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
@@ -1645,7 +1672,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             nextop = F8;
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             GETGX_vector(q0, 1, VECTOR_SEW64);
-            vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+            VECTOR_LOAD_VMASK(1, x1, 1);
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
@@ -1789,7 +1816,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (MODREG) {
                 q1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 1, x1, 1);
+                VECTOR_LOAD_VMASK(1, x1, 1);
                 SMREAD();
                 addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
                 q1 = fpu_get_scratch(dyn);
@@ -1857,7 +1884,7 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             VSRA_VI(v1, v0, 15, VECTOR_UNMASKED);
             VXOR_VV(v0, v1, v0, VECTOR_UNMASKED);
             VSUB_VV(v1, v0, v1, VECTOR_UNMASKED);
-            vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 2);
+            VECTOR_LOAD_VMASK(0xFF, x4, 2);
             VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
             VREDSUM_VS(v0, v1, v0, VECTOR_MASKED); // sum low 64
             VSLIDEDOWN_VI(d0, v1, 8, VECTOR_UNMASKED);
diff --git a/src/dynarec/rv64/dynarec_rv64_f20f_vector.c b/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
index 263f30307e..120e128153 100644
--- a/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_f20f_vector.c
@@ -56,7 +56,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, VECTOR_SEW64);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, VECTOR_SEW64);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -68,7 +68,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
                 d0 = fpu_get_scratch(dyn);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                 VMERGE_VVM(v0, v0, d0); // implies VMASK
@@ -84,7 +84,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 ed = (nextop & 7) + (rex.b << 3);
                 d0 = sse_get_reg_vector(dyn, ninst, x1, ed, 1, VECTOR_SEW64);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -115,7 +115,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v1 = fpu_get_scratch(dyn);
                 VFMV_S_F(v1, v0);
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VMERGE_VVM(v0, v0, v1); // implies VMASK
             } else {
                 VFMV_S_F(v0, v0);
@@ -133,7 +133,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             }
@@ -168,7 +168,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             }
@@ -209,13 +209,13 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW64);
             }
             if (box64_dynarec_fastnan) {
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VFMUL_VV(v0, v0, v1, VECTOR_MASKED);
             } else {
                 VFMV_F_S(v0, v0);
@@ -232,7 +232,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
@@ -251,7 +251,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW64);
@@ -271,13 +271,13 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                    VECTOR_LOAD_VMASK(0b01, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
                 }
             } else {
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VFDIV_VV(v0, v0, v1, VECTOR_MASKED);
             }
             break;
@@ -293,7 +293,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 d1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
                 GETGX_vector(d0, 1, VECTOR_SEW64);
@@ -346,7 +346,7 @@ uintptr_t dynarec64_F20F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v0 = fpu_get_scratch(dyn);
                 VMV_S_X(v0, x2);
-                vector_loadmask(dyn, ninst, VMASK, 0b01, x4, 1);
+                VECTOR_LOAD_VMASK(0b01, x4, 1);
                 VMERGE_VVM(d0, d0, v0); // implies VMASK
             } else {
                 VMV_S_X(d0, x2);
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f_vector.c b/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
index 46c3db2dc0..3dda70b153 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f_vector.c
@@ -58,7 +58,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1, VECTOR_SEW32);
                 v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, VECTOR_SEW32);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -70,7 +70,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
                 d0 = fpu_get_scratch(dyn);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xF, x4, 1);
+                VECTOR_LOAD_VMASK(0xF, x4, 1);
                 VLE8_V(d0, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 VXOR_VV(v0, v0, v0, VECTOR_UNMASKED);
                 VMERGE_VVM(v0, v0, d0); // implies VMASK
@@ -86,7 +86,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 ed = (nextop & 7) + (rex.b << 3);
                 d0 = sse_get_reg_vector(dyn, ninst, x1, ed, 1, VECTOR_SEW32);
                 if (rv64_xtheadvector) {
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, v1); // implies VMASK
                 } else {
                     VMV_X_S(x4, v1);
@@ -119,7 +119,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v1 = fpu_get_scratch(dyn);
                 VFMV_S_F(v1, v0);
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VMERGE_VVM(v0, v0, v1); // implies VMASK
             } else {
                 VFMV_S_F(v0, v0);
@@ -139,13 +139,13 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
             }
             if (box64_dynarec_fastnan) {
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VFMUL_VV(v0, v0, v1, VECTOR_MASKED);
             } else {
                 VFMV_F_S(v0, v0);
@@ -162,7 +162,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 if (rv64_xtheadvector) {
                     d0 = fpu_get_scratch(dyn);
                     VFMV_S_F(d0, v0);
-                    vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                    VECTOR_LOAD_VMASK(0b0001, x4, 1);
                     VMERGE_VVM(v0, v0, d0); // implies VMASK
                 } else {
                     VFMV_S_F(v0, v0);
@@ -181,13 +181,13 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
             }
             d0 = fpu_get_scratch_lmul(dyn, VECTOR_LMUL2);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VFWCVT_F_F_V(d0, v1, VECTOR_MASKED);
             SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
             if (rv64_xtheadvector) {
@@ -209,7 +209,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
@@ -218,7 +218,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             d1 = fpu_get_scratch(dyn);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VMV_V_V(q1, VMASK);
             VMFEQ_VV(d0, v0, v0, VECTOR_MASKED);
             VMFEQ_VV(d1, v1, v1, VECTOR_MASKED);
@@ -241,7 +241,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 v1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 0);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(v1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(v0, 1, VECTOR_SEW32);
@@ -250,7 +250,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             d1 = fpu_get_scratch(dyn);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+            VECTOR_LOAD_VMASK(0b0001, x4, 1);
             VMV_V_V(q1, VMASK);
             VMFEQ_VV(d0, v0, v0, VECTOR_MASKED);
             VMFEQ_VV(d1, v1, v1, VECTOR_MASKED);
@@ -278,7 +278,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 d1 = fpu_get_scratch(dyn);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 0, 1);
-                vector_loadmask(dyn, ninst, VMASK, 0xFF, x4, 1);
+                VECTOR_LOAD_VMASK(0xFF, x4, 1);
                 VLE8_V(d1, ed, VECTOR_MASKED, VECTOR_NFIELD1);
                 SET_ELEMENT_WIDTH(x1, VECTOR_SEW32, 1);
                 GETGX_vector(d0, 1, VECTOR_SEW32);
@@ -331,7 +331,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             if (rv64_xtheadvector) {
                 v0 = fpu_get_scratch(dyn);
                 VMV_S_X(v0, x2);
-                vector_loadmask(dyn, ninst, VMASK, 0b0001, x4, 1);
+                VECTOR_LOAD_VMASK(0b0001, x4, 1);
                 VMERGE_VVM(d0, d0, v0); // implies VMASK
             } else {
                 VMV_S_X(d0, x2);
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index eb239e5cd7..ea62bc69be 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -2711,6 +2711,11 @@ void vector_loadmask(dynarec_rv64_t* dyn, int ninst, int vreg, uint64_t imm, int
             ADDI(s1, xZR, 1);
             VMV_S_X(vreg, s1);
             return;
+        case 0b0101:
+            vector_vsetvli(dyn, ninst, s1, VECTOR_SEW64, VECTOR_LMUL1, 1);
+            VMV_V_I(vreg, 1);
+            vector_vsetvli(dyn, ninst, s1, sew, vlmul, multiple);
+            return;
         case 0b1010:
             vector_vsetvli(dyn, ninst, s1, VECTOR_SEW64, VECTOR_LMUL1, 1);
             MOV64x(s1, 0x100000000ULL);
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 12dd594b4d..d2d84e9074 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1828,4 +1828,7 @@ uintptr_t dynarec64_F30F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         } \
     } while (0)
 
+#define VECTOR_LOAD_VMASK(mask, s1, multiple) \
+    vector_loadmask(dyn, ninst, VMASK, mask, s1, multiple)
+
 #endif //__DYNAREC_RV64_HELPER_H__