diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1ebfa297f4fc3..88bc2c7566a90 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -104,6 +104,7 @@ class SIFoldOperands : public MachineFunctionPass { bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; bool tryFoldFoldableCopy(MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const; + bool tryFoldUniformReadFirstLaneCndMask(MachineInstr &MI) const; const MachineOperand *isClamp(const MachineInstr &MI) const; bool tryFoldClamp(MachineInstr &MI); @@ -1400,6 +1401,81 @@ bool SIFoldOperands::tryFoldFoldableCopy( return Changed; } +// Try to fold the following pattern: +// s_cselect s[2:3], -1, 0 +// v_cndmask v0, 0, +-1, s[2:3] +// v_readfirstlane s0, v0 +// +// into +// +// s_cselect s0, +-1, 0 +bool SIFoldOperands::tryFoldUniformReadFirstLaneCndMask( + MachineInstr &MI) const { + if (MI.getOpcode() != AMDGPU::V_READFIRSTLANE_B32) + return false; + + MachineInstr *RFLSrc = MRI->getVRegDef(MI.getOperand(1).getReg()); + if (!RFLSrc) + return false; + + // We can also have the following pattern: + // + // %2:vreg_64 = REG_SEQUENCE %X:vgpr_32, sub0, %1:sreg_32, sub1 + // %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64 + // + // In this case we dig into %X or %Y depending on which sub register + // the V_READFIRSTLANE accesses. + if (RFLSrc->isRegSequence()) { + unsigned RFLSubReg = MI.getOperand(1).getSubReg(); + if (RFLSrc->getNumOperands() != 5) + return false; + + if (RFLSrc->getOperand(2).getImm() == RFLSubReg) + RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(1).getReg()); + else if (RFLSrc->getOperand(4).getImm() == RFLSubReg) + RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(3).getReg()); + else + return false; + } + + // Need e64 to have a SGPR regmask. 
+ if (!RFLSrc || RFLSrc->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src1); + Register Src2 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src2)->getReg(); + + if (!Src0->isImm() || Src0->getImm() != 0 || !Src1->isImm()) + return false; + + bool IsSigned = (Src1->getImm() == -1); + if (Src1->getImm() != 1 && !IsSigned) + return false; + + MachineInstr *CSel = MRI->getVRegDef(Src2); + if (!CSel || (CSel->getOpcode() != AMDGPU::S_CSELECT_B32 && + CSel->getOpcode() != AMDGPU::S_CSELECT_B64)) + return false; + + MachineOperand *CSelSrc0 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src0); + MachineOperand *CSelSrc1 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src1); + // Note: we could also allow any non-zero value for CSelSrc0, and adapt the + // BFE's mask depending on where the first set bit is. + if (!CSelSrc0->isImm() || CSelSrc0->getImm() != -1 || !CSelSrc1->isImm() || + CSelSrc1->getImm() != 0) + return false; + + // Build a S_CSELECT right before the old one so we're sure it uses the same + // SCC def. + BuildMI(*CSel->getParent(), *CSel, MI.getDebugLoc(), + TII->get(AMDGPU::S_CSELECT_B32), MI.getOperand(0).getReg()) + .addImm(IsSigned ? -1 : 1) + .addImm(0); + MI.eraseFromParent(); + return true; +} + // Clamp patterns are canonically selected to v_max_* instructions, so only // handle them. const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { @@ -2087,6 +2163,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { continue; } + if (tryFoldUniformReadFirstLaneCndMask(MI)) { + Changed = true; + continue; + } + // Saw an unknown clobber of m0, so we no longer know what it is. 
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI)) CurrentKnownM0Val = nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 667c561ea26f6..140e74f40052a 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1532,15 +1532,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 ; SI-NEXT: s_and_b32 s5, s3, 0x1ff -; SI-NEXT: s_and_b32 s6, s4, 0xffe +; SI-NEXT: s_and_b32 s4, s4, 0xffe ; SI-NEXT: s_or_b32 s2, s5, s2 ; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; SI-NEXT: v_readfirstlane_b32 s2, v1 +; SI-NEXT: s_cselect_b32 s2, 1, 0 ; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014 -; SI-NEXT: s_or_b32 s2, s6, s2 ; SI-NEXT: s_sub_i32 s6, 0x3f1, s5 +; SI-NEXT: s_or_b32 s2, s4, s2 ; SI-NEXT: v_med3_i32 v1, s6, 0, 13 ; SI-NEXT: s_or_b32 s4, s2, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s6, v1 @@ -1595,15 +1593,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff -; VI-NEXT: s_and_b32 s2, s0, 0xffe -; VI-NEXT: s_or_b32 s0, s1, s6 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014 +; VI-NEXT: s_and_b32 s0, s0, 0xffe +; VI-NEXT: s_or_b32 s1, s1, s6 +; VI-NEXT: s_cmp_lg_u32 s1, 0 +; VI-NEXT: s_cselect_b32 s1, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_or_b32 s4, s2, s0 +; VI-NEXT: s_or_b32 s4, s0, s1 +; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014 ; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 ; VI-NEXT: v_med3_i32 v2, s2, 0, 13 ; VI-NEXT: s_or_b32 s0, s4, 0x1000 @@ -1657,14 +1653,12 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: s_and_b32 s1, s7, 0x1ff -; GFX9-NEXT: s_and_b32 s2, s0, 0xffe -; GFX9-NEXT: s_or_b32 s0, s1, s6 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffe +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 +; GFX9-NEXT: s_or_b32 s6, s0, s1 ; GFX9-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; GFX9-NEXT: s_or_b32 s6, s2, s0 ; GFX9-NEXT: s_sub_i32 s2, 0x3f1, s1 ; GFX9-NEXT: v_med3_i32 v1, s2, 0, 13 ; GFX9-NEXT: s_or_b32 s0, s6, 0x1000 @@ -1714,55 +1708,52 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s6 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffe ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1 -; GFX11-NEXT: s_addk_i32 s1, 0xfc10 -; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_lshl_b32 s8, s1, 12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_or_b32 s3, s2, 0x1000 -; GFX11-NEXT: s_or_b32 s8, s2, s8 -; GFX11-NEXT: s_lshr_b32 s6, s3, 
s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_sub_i32 s6, 0x3f1, s3 +; GFX11-NEXT: s_or_b32 s2, s1, 0x1000 +; GFX11-NEXT: v_med3_i32 v0, s6, 0, 13 +; GFX11-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v0 +; GFX11-NEXT: s_or_b32 s8, s1, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s6, s2, s6 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_or_b32 s3, s6, s3 -; GFX11-NEXT: s_cmp_lt_i32 s1, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s6, s3, 7 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s6, s2 +; GFX11-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-NEXT: s_cselect_b32 s2, s2, s8 +; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-NEXT: s_lshr_b32 s2, s2, 2 ; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmp_lt_i32 s1, 31 -; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_addc_u32 s2, s2, 0 +; GFX11-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_lshr_b32 s1, s7, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1770,7 +1761,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 97216b6c94693..aca9f21316987 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -107,46 +107,44 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_lshr_b32 s4, s7, 8 ; SI-NEXT: s_and_b32 s5, s7, 0x1ff -; SI-NEXT: s_and_b32 s8, s4, 0xffe -; SI-NEXT: s_or_b32 s4, s5, s6 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: s_bfe_u32 s4, s7, 0xb0014 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; 
SI-NEXT: s_sub_i32 s6, 0x3f1, s4 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc10 -; SI-NEXT: s_or_b32 s11, s8, s5 -; SI-NEXT: v_med3_i32 v0, s6, 0, 13 -; SI-NEXT: s_lshl_b32 s4, s10, 12 -; SI-NEXT: s_or_b32 s5, s11, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_or_b32 s4, s11, s4 -; SI-NEXT: s_lshr_b32 s6, s5, s6 -; SI-NEXT: v_lshl_b32_e32 v0, s6, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffe +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_bfe_u32 s6, s7, 0xb0014 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_sub_i32 s4, 0x3f1, s6 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_or_b32 s5, s10, 0x1000 +; SI-NEXT: v_med3_i32 v0, s4, 0, 13 +; SI-NEXT: s_lshl_b32 s4, s6, 12 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_lshr_b32 s8, s5, s8 +; SI-NEXT: v_lshl_b32_e32 v0, s8, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, s5, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_cmp_lt_i32 s10, 1 -; SI-NEXT: s_cselect_b32 s6, s5, s4 -; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s11, s5, s4 +; SI-NEXT: s_and_b32 s8, s11, 7 ; SI-NEXT: s_cmp_gt_i32 s8, 5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_lshr_b32 s11, s11, 2 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_addc_u32 s4, s6, 0 -; SI-NEXT: s_cmp_lt_i32 s10, 31 -; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s11, 0 +; SI-NEXT: s_addc_u32 s4, s11, 0 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s8, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s10, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f -; SI-NEXT: 
v_mov_b32_e32 v1, s6 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -164,17 +162,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s7, 0x1ff +; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s5, s6 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s5, 0 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s5 ; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4 ; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5 ; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 ; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000 @@ -298,48 +294,46 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe ; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, 
s2, 12 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6 -; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s5, s3, 0xb0014 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s6, 0x3f1, s5 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s2, 0x1000 +; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v0, s6, 0, 13 +; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s5, 0xfc10 +; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s5, 12 +; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s2, s7 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s4, s6 +; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s6 +; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 ; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s5, 1 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s4, 7 ; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s4, s4, 2 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; 
GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s5, 31 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16 ; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo ; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -427,48 +421,45 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s5, s3, 0xb0014 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s6, 0x3f1, s5 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s2, 0x1000 +; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v0, s6, 0, 13 +; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s5, 0xfc10 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 +; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s5, 12 +; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v0 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s2, s7 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s4, s6 +; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s6 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 ; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s5, 1 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s4, 7 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s4, s4, 2 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s5, 31 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16 ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -476,7 +467,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo ; 
GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir new file mode 100644 index 0000000000000..37e39c07f5ef4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir @@ -0,0 +1,207 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s + +--- +name: unsigned_32bits +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: unsigned_32bits + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... + +--- +name: signed_32bits +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: signed_32bits + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... 
+ +--- +name: unsigned_64bits +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: unsigned_64bits + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1 + %dst:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec +... + +--- +name: signed_64bits +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: signed_64bits + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1 + %dst:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec +... 
+ +--- +name: unsigned_64bits_double +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: unsigned_64bits_double + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst0:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc + ; GCN-NEXT: %dst1:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1 + %dst0:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec + %dst1:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +... + +--- +name: signed_64bits_double +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: signed_64bits_double + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %dst0:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: %dst1:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1 + %dst0:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec + %dst1:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +... 
+ +--- +name: bad_subreg_64bits +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: bad_subreg_64bits + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1 + %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub1, implicit $exec +... + +--- +name: bad_select_imm +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: bad_select_imm + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 1, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 1, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... 
+ +--- +name: bad_select_imm_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: bad_select_imm_2 + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec + %0:sreg_32_xm0_xexec = S_CSELECT_B32 0, 1, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... + +--- +name: bad_cndmask_imm +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: bad_cndmask_imm + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 1, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... + + +--- +name: bad_cndmask_imm_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + ; GCN-LABEL: name: bad_cndmask_imm_2 + ; GCN: liveins: $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, [[S_CSELECT_B32_]], implicit $exec + ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec + %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, %0, implicit $exec + %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir
new file mode 100644
index 0000000000000..fe90e9f51f057
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir
@@ -0,0 +1,207 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+---
+name: unsigned_32bits  # cndmask picks 0/1 from the wave-wide mask: folds to S_CSELECT_B32 1, 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: signed_32bits  # cndmask picks 0/-1: folds to S_CSELECT_B32 -1, 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_32bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_64bits  # readfirstlane of REG_SEQUENCE sub0 looks through to the cndmask: folds
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: signed_64bits  # signed variant: looks through REG_SEQUENCE sub0, folds to S_CSELECT_B32 -1, 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: unsigned_64bits_double  # both subregs come from the same cndmask: both readfirstlanes fold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst1:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: %dst0:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %dst1:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %dst0:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: signed_64bits_double  # signed variant: both readfirstlanes fold to S_CSELECT_B32 -1, 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: %dst0:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc
+    ; GCN-NEXT: %dst1:sreg_32 = S_CSELECT_B32 -1, 0, implicit $scc
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %dst0:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %dst1:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: bad_subreg_64bits  # negative: readfirstlane reads sub1 (the V_MOV), not the cndmask
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_subreg_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %3.sub1, implicit $exec
+...
+
+---
+name: bad_select_imm  # negative: S_CSELECT false operand is 1, fold requires 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_select_imm_2  # negative: S_CSELECT true operand is 0, fold requires -1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 0, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 0, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_cndmask_imm  # negative: cndmask src0 is 1, fold requires 0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 1, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+
+---
+name: bad_cndmask_imm_2  # negative: cndmask src1 is 0, fold requires +1 or -1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: %dst:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, %0, implicit $exec
+    %dst:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...