diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba..9e8fc5d635c50 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,10 +696,6 @@ class CombinerHelper {
   /// (G_*MULO x, 0) -> 0 + no carry out
   bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Match:
-  /// (G_*ADDO x, 0) -> x + no carry out
-  bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
-
   /// Match:
   /// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   /// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -810,12 +806,15 @@ class CombinerHelper {
   /// Combine selects.
   bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ands,
+  /// Combine ands.
   bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ors,
+  /// Combine ors.
   bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Combine addos.
+  bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +918,7 @@ class CombinerHelper {
   bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
   bool isConstantSplatVector(Register Src, int64_t SplatValue,
                              bool AllowUndefs);
+  bool isConstantOrConstantVectorI(Register Src) const;
   std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a97..6b03703192df9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
   Register getCarryOutReg() const { return getReg(1); }
   MachineOperand &getLHS() { return getOperand(2); }
   MachineOperand &getRHS() { return getOperand(3); }
+  Register getLHSReg() const { return getOperand(2).getReg(); }
+  Register getRHSReg() const { return getOperand(3).getReg(); }
 
   static bool classof(const MachineInstr *MI) {
     switch (MI->getOpcode()) {
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
   }
 };
 
+/// Represents overflowing add operations.
+/// G_UADDO, G_SADDO
+class GAddCarryOut : public GBinOpCarryOut {
+public:
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SADDO; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_UADDO:
+    case TargetOpcode::G_SADDO:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents overflowing add/sub operations that also consume a carry-in.
 /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
 class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9f18a5b856009..6980cbd04aeb1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1090,12 +1090,6 @@ def mulo_by_0: GICombineRule<
    [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
-def addo_by_0: GICombineRule<
-  (defs root:$root, build_fn_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_UADDO, G_SADDO):$root,
-   [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-
 // Transform (uadde x, y, 0) -> (uaddo x, y)
 //           (sadde x, y, 0) -> (saddo x, y)
 //           (usube x, y, 0) -> (usubo x, y)
@@ -1291,6 +1285,12 @@ def match_ors : GICombineRule<
    [{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_addos : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SADDO, G_UADDO):$root,
+   [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // Combines concat operations
 def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
@@ -1326,7 +1326,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 
 def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
-                                     addo_by_0, adde_to_addo,
+                                     adde_to_addo,
                                      combine_minmax_nan]>;
 
 def known_bits_simplifications : GICombineGroup<[
@@ -1374,7 +1374,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
-    combine_concat_vector, double_icmp_zero_and_or_combine]>;
+    combine_concat_vector, double_icmp_zero_and_or_combine, match_addos]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2e706b4680193..bee49dbd0f838 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4936,24 +4936,6 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
-bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
-  // (G_*ADDO x, 0) -> x + no carry out
-  assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
-         MI.getOpcode() == TargetOpcode::G_SADDO);
-  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
-    return false;
-  Register Carry = MI.getOperand(1).getReg();
-  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
-    return false;
-  Register Dst = MI.getOperand(0).getReg();
-  Register LHS = MI.getOperand(2).getReg();
-  MatchInfo = [=](MachineIRBuilder &B) {
-    B.buildCopy(Dst, LHS);
-    B.buildConstant(Carry, 0);
-  };
-  return true;
-}
-
 bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
   // (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   // (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -6354,6 +6336,26 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
   return Value;
 }
 
+// FIXME G_SPLAT_VECTOR
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) const {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return true;
+
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+
+  unsigned NumSources = BuildVector->getNumSources();
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return false;
+  }
+  return true;
+}
+
 // TODO: use knownbits to determine zeros
 bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
                                               BuildFnTy &MatchInfo) {
@@ -6928,3 +6930,178 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
 
   return false;
 }
+
+bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
+
+  // Addo has no flags
+  Register Dst = Add->getReg(0);
+  Register Carry = Add->getReg(1);
+  Register LHS = Add->getLHSReg();
+  Register RHS = Add->getRHSReg();
+  bool IsSigned = Add->isSigned();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // We want to fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Dst))
+    return false;
+
+  // Fold addo, if the carry is dead -> add, undef.
+  if (MRI.use_nodbg_empty(Carry) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS);
+      B.buildUndef(Carry);
+    };
+    return true;
+  }
+
+  // We want to fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Carry))
+    return false;
+
+  // Canonicalize constant to RHS.
+  if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+    if (IsSigned) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAddo(Dst, Carry, RHS, LHS);
+      };
+      return true;
+    }
+    // !IsSigned
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildUAddo(Dst, Carry, RHS, LHS);
+    };
+    return true;
+  }
+
+  std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+  std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+
+  // Fold addo(c1, c2) -> c3, carry.
+  if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
+      isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    bool Overflow;
+    APInt Result = IsSigned ? MaybeLHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeLHS->uadd_ov(*MaybeRHS, Overflow);
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildConstant(Dst, Result);
+      B.buildConstant(Carry, Overflow);
+    };
+    return true;
+  }
+
+  // Fold (addo x, 0) -> x, no carry
+  if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildCopy(Dst, LHS);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+
+  // Given 2 constant operands whose sum does not overflow:
+  // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+  // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+  GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
+  if (MaybeRHS && AddLHS && MRI.hasOneNonDBGUse(Add->getReg(0)) &&
+      ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
+       (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+    std::optional<APInt> MaybeAddRHS =
+        getConstantOrConstantSplatVector(AddLHS->getRHSReg());
+    if (MaybeAddRHS) {
+      bool Overflow;
+      APInt NewC = IsSigned ? MaybeAddRHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeAddRHS->uadd_ov(*MaybeRHS, Overflow);
+      if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
+        if (IsSigned) {
+          MatchInfo = [=](MachineIRBuilder &B) {
+            auto ConstRHS = B.buildConstant(DstTy, NewC);
+            B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+          };
+          return true;
+        }
+        // !IsSigned
+        MatchInfo = [=](MachineIRBuilder &B) {
+          auto ConstRHS = B.buildConstant(DstTy, NewC);
+          B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+        };
+        return true;
+      }
+    }
+  }
+
+  // We try to combine addo to non-overflowing add.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  // We try to combine uaddo to non-overflowing add.
+  if (!IsSigned) {
+    ConstantRange CRLHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/false);
+    ConstantRange CRRHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/false);
+
+    switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS);
+        B.buildConstant(Carry, 1);
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // We try to combine saddo to non-overflowing add.
+
+  // If LHS and RHS each have at least two sign bits, then there is no signed
+  // overflow.
+ if (KB->computeNumSignBits(RHS) > 1 && KB->computeNumSignBits(LHS) > 1) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap); + B.buildConstant(Carry, 0); + }; + return true; + } + + ConstantRange CRLHS = + ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/true); + ConstantRange CRRHS = + ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/true); + + switch (CRLHS.signedAddMayOverflow(CRRHS)) { + case ConstantRange::OverflowResult::MayOverflow: + return false; + case ConstantRange::OverflowResult::NeverOverflows: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap); + B.buildConstant(Carry, 0); + }; + return true; + } + case ConstantRange::OverflowResult::AlwaysOverflowsLow: + case ConstantRange::OverflowResult::AlwaysOverflowsHigh: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildAdd(Dst, LHS, RHS); + B.buildConstant(Carry, 1); + }; + return true; + } + } + + return false; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir new file mode 100644 index 0000000000000..6fced31a622d9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: add_unused +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_unused + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %add:_(s32), %o:_(s1) = G_SADDO %0, %1 + $w0 = COPY %add(s32) + RET_ReallyLR implicit $w0 +... +--- +name: add_canon +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_canon + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_SADDO [[COPY]], %const + ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1) + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: $w1 = COPY %o_wide(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %const:_(s32) = G_CONSTANT i32 10 + %add:_(s32), %o:_(s1) = G_SADDO %const, %1 + %o_wide:_(s32) = G_ZEXT %o(s1) + $w0 = COPY %add(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... +--- +name: add_const_fold +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_const_fold + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %add:_(s32) = G_CONSTANT i32 21 + ; CHECK-NEXT: %o_wide:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: $w1 = COPY %o_wide(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %const:_(s32) = G_CONSTANT i32 10 + %const1:_(s32) = G_CONSTANT i32 11 + %add:_(s32), %o:_(s1) = G_UADDO %const, %const1 + %o_wide:_(s32) = G_ZEXT %o(s1) + $w0 = COPY %add(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... 
+--- +name: add_add_zero +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_add_zero + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $w1 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = COPY $w2 + %const:_(s32) = G_CONSTANT i32 10 + %addl:_(s32) = nsw G_ADD %2, %const + %const1:_(s32) = G_CONSTANT i32 -10 + %add:_(s32), %o:_(s1) = G_SADDO %addl, %const1 + %o_wide:_(s32) = G_ZEXT %o(s1) + $w0 = COPY %add(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir index 94f56e5650b22..9483cbf06f405 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="match_addos" -global-isel -verify-machineinstrs %s -o - | FileCheck %s # REQUIRES: asserts # (G_*ADDO x, 0) -> x + no carry diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll index 444aaeb0f3fe7..1fd60c0309790 100644 --- a/llvm/test/CodeGen/AArch64/overflow.ll +++ b/llvm/test/CodeGen/AArch64/overflow.ll @@ -19,20 +19,12 @@ entry: } define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) { -; SDAG-LABEL: saddo1.i32.fold: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: mov w8, #20 // =0x14 -; SDAG-NEXT: mov w0, wzr -; SDAG-NEXT: str w8, [x2] -; SDAG-NEXT: ret -; -; GISEL-LABEL: saddo1.i32.fold: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: mov w8, #9 // =0x9 -; GISEL-NEXT: adds w8, w8, #11 -; GISEL-NEXT: cset w0, vs -; GISEL-NEXT: str w8, [x2] -; GISEL-NEXT: ret +; CHECK-LABEL: saddo1.i32.fold: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #20 // =0x14 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11) %val = extractvalue {i32, i1} %t, 0 @@ -123,18 +115,11 @@ entry: } define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { -; SDAG-LABEL: saddo.canon.i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: mov w0, wzr -; SDAG-NEXT: str w4, [x5] -; SDAG-NEXT: ret -; -; GISEL-LABEL: saddo.canon.i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: adds w8, wzr, w4 -; GISEL-NEXT: cset w0, vs -; GISEL-NEXT: str w8, [x5] -; GISEL-NEXT: ret +; CHECK-LABEL: saddo.canon.i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str w4, [x5] +; CHECK-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5) %val = extractvalue {i32, i1} %t, 0 @@ -143,13 +128,19 @@ entry: ret i1 %obit } define zeroext i1 @saddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { -; CHECK-LABEL: saddo.add.i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add w8, w4, #100 -; CHECK-NEXT: subs w8, w8, 
#100 -; CHECK-NEXT: cset w0, vs -; CHECK-NEXT: str w8, [x5] -; CHECK-NEXT: ret +; SDAG-LABEL: saddo.add.i32: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: add w8, w4, #100 +; SDAG-NEXT: subs w8, w8, #100 +; SDAG-NEXT: cset w0, vs +; SDAG-NEXT: str w8, [x5] +; SDAG-NEXT: ret +; +; GISEL-LABEL: saddo.add.i32: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: mov w0, wzr +; GISEL-NEXT: str w4, [x5] +; GISEL-NEXT: ret entry: %lhs = add nsw i32 %v5, 100 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %lhs, i32 -100) @@ -160,13 +151,20 @@ entry: } define zeroext i1 @uaddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { -; CHECK-LABEL: uaddo.add.i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add w8, w4, #5 -; CHECK-NEXT: adds w8, w8, #5 -; CHECK-NEXT: cset w0, hs -; CHECK-NEXT: str w8, [x5] -; CHECK-NEXT: ret +; SDAG-LABEL: uaddo.add.i32: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: add w8, w4, #5 +; SDAG-NEXT: adds w8, w8, #5 +; SDAG-NEXT: cset w0, hs +; SDAG-NEXT: str w8, [x5] +; SDAG-NEXT: ret +; +; GISEL-LABEL: uaddo.add.i32: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: adds w8, w4, #10 +; GISEL-NEXT: cset w0, hs +; GISEL-NEXT: str w8, [x5] +; GISEL-NEXT: ret entry: %lhs = add nuw i32 %v5, 5 %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %lhs, i32 5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index d36f5c0ea89d9..a6f9bb7ee055d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 
s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: 
v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: saddsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; 
GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 
vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; 
GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_add_u32 s0, s2, s6 ; GFX10-NEXT: s_addc_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_add_u32 s0, s2, s6 ; GFX11-NEXT: s_addc_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 @@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 @@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 
inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc @@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 @@ -5882,18 +5877,17 @@ define <2 x i128> 
@v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 @@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 
v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s10, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_add_u32 s0, s4, s12 @@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo @@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: s_add_u32 s0, s4, s12 @@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 0a6b7af2f78d4..84906c01a4698 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -3091,253 +3091,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 
v12, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 
v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 -; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 
v8, v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; 
GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; 
GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index c455b24313ddc..83ebc84e1f84a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; GISEL-NEXT: 
v_add_i32_e64 v3, s[4:5], 0, 0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; 
GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1] ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v6, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 -; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5 +; GISEL-NEXT: v_add_i32_e32 v0, 
vcc, v4, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 
v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 61e1e67b7ae36..320dfbb4980e4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; 
GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: ssubsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; 
GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) 
{ ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 
vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_sub_u32 s0, s2, s6 ; GFX10-NEXT: s_subb_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 
+5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_sub_u32 s0, s2, s6 ; GFX11-NEXT: s_subb_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s11, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s11, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 @@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 ; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 @@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> 
@ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s4 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc @@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; 
GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc @@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 @@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 +; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 -; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 @@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s19, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 @@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 
31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 @@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 @@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_ashr_i32 s8, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: s_ashr_i32 s4, s3, 31 ; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 @@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_ashr_i32 s8, s19, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16 ; 
GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 887c43f5fce59..d15551365707b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 5c6bb6dea1646..07480a0ce0c2e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10