diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 5c411a0955878..7bf6a635158eb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1519,6 +1519,9 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::V_PK_MAX_F16: { + if (MI.mayRaiseFPException()) + return nullptr; + if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) return nullptr; @@ -1565,6 +1568,9 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (TII->getClampMask(*Def) != TII->getClampMask(MI)) return false; + if (Def->mayRaiseFPException()) + return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); if (!DefClamp) return false; @@ -1650,7 +1656,9 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 || Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 || Op == AMDGPU::V_MUL_F16_fake16_e64) && - MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) + MFI->getMode().FP64FP16Denormals.Output != + DenormalMode::PreserveSign) || + MI.mayRaiseFPException()) return std::pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1725,6 +1733,9 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) return false; + if (Def->mayRaiseFPException()) + return false; + // Clamp is applied after omod. If the source already has clamp set, don't // fold it. if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 30c27b6439fc0..cc1b9ac0c9ecd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3972,7 +3972,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .add(*Src1) - .addImm(Imm); + .addImm(Imm) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -3991,7 +3992,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4012,7 +4014,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src1) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4048,7 +4051,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Src2Mods ? Src2Mods->getImm() : 0) .add(*Src2) .addImm(Clamp ? Clamp->getImm() : 0) - .addImm(Omod ? Omod->getImm() : 0); + .addImm(Omod ? Omod->getImm() : 0) + .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? OpSel->getImm() : 0); updateLiveVariables(LV, MI, *MIB); diff --git a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index d1ba62f1d87f6..761dd162df609 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -410,3 +410,91 @@ body: | %1 = V_MAX_F32_e64 0, killed %0, 0, 1056964608, 1, 0, implicit $mode, implicit $exec ... + +--- +# GCN-LABEL: name: clamp_missing_nofpexcept_0 +# GCN: %2:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GCN-NEXT: %3:vgpr_32 = V_MAX_F32_e64 0, killed %2, 0, killed %2, 1, 0, implicit $mode, implicit $exec +name: clamp_missing_nofpexcept_0 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + fp32-input-denormals: false + fp32-output-denormals: false + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_MAX_F32_e64 0, killed %2, 0, killed %2, 1, 0, implicit $mode, implicit $exec +... + +--- +# GCN-LABEL: name: clamp_missing_nofpexcept_1 +# GCN: %2:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GCN-NEXT: %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed %2, 0, killed %2, 1, 0, implicit $mode, implicit $exec +name: clamp_missing_nofpexcept_1 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + fp32-input-denormals: false + fp32-output-denormals: false + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed %2, 0, killed %2, 1, 0, implicit $mode, implicit $exec +... + +--- +# GCN-LABEL: name: omod_missing_nofpexcept_0 +# GCN: %2:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GCN-NEXT: %3:vgpr_32 = nsz V_MUL_F32_e64 0, killed %2, 0, 1056964608, 0, 0, implicit $mode, implicit $exec +name: omod_missing_nofpexcept_0 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = nofpexcept V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = nsz V_MUL_F32_e64 0, killed %2, 0, 1056964608, 0, 0, implicit $mode, implicit $exec + +... + +--- +# GCN-LABEL: name: omod_missing_nofpexcept_1 +# GCN: %2:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GCN-NEXT: %3:vgpr_32 = nsz nofpexcept V_MUL_F32_e64 0, killed %2, 0, 1056964608, 0, 0, implicit $mode, implicit $exec +name: omod_missing_nofpexcept_1 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = nsz nofpexcept V_MUL_F32_e64 0, killed %2, 0, 1056964608, 0, 0, implicit $mode, implicit $exec + +...