-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[GlobalISel][AArch64] Add G_FPTOSI_SAT/G_FPTOUI_SAT #96297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-llvm-globalisel Author: David Green (davemgreen) ChangesThis is an implementation of the saturating fp to int conversions for GlobalISel. On AArch64 the converstion instrctions work this way, producing saturating results. LegalizerHelper::lowerFPTOINT_SAT is ported from SDAG. AArch64 has a lot of existing tests for fptosi_sat, covering a wide range of types. I have tried to make most of them work all at once, but a few fall back due to other missing features such as f128 handling for min/max. Patch is 544.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96297.diff 19 Files Affected:
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 42f56348885b4..54a78745fa8f9 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -473,6 +473,11 @@ G_FPTOSI, G_FPTOUI, G_SITOFP, G_UITOFP
Convert between integer and floating point.
+G_FPTOSI_SAT, G_FPTOUI_SAT
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Saturating convert between integer and floating point.
+
G_FABS
^^^^^^
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 995031f7c00be..2d0773627aa4c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -818,6 +818,8 @@ class GCastOp : public GenericMachineInstr {
case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT:
case TargetOpcode::G_FPTRUNC:
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_PTRTOINT:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 284f434fbb9b0..0d2572b982e34 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -393,6 +393,7 @@ class LegalizerHelper {
LegalizeResult lowerSITOFP(MachineInstr &MI);
LegalizeResult lowerFPTOUI(MachineInstr &MI);
LegalizeResult lowerFPTOSI(MachineInstr &MI);
+ LegalizeResult lowerFPTOINT_SAT(MachineInstr &MI);
LegalizeResult lowerFPTRUNC_F64_TO_F16(MachineInstr &MI);
LegalizeResult lowerFPTRUNC(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 92e05ee858a75..7429ca42640bb 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1965,6 +1965,16 @@ class MachineIRBuilder {
return buildInstr(TargetOpcode::G_FPTOSI, {Dst}, {Src0});
}
+ /// Build and insert \p Res = G_FPTOUI_SAT \p Src0
+ MachineInstrBuilder buildFPTOUI_SAT(const DstOp &Dst, const SrcOp &Src0) {
+ return buildInstr(TargetOpcode::G_FPTOUI_SAT, {Dst}, {Src0});
+ }
+
+ /// Build and insert \p Res = G_FPTOSI_SAT \p Src0
+ MachineInstrBuilder buildFPTOSI_SAT(const DstOp &Dst, const SrcOp &Src0) {
+ return buildInstr(TargetOpcode::G_FPTOSI_SAT, {Dst}, {Src0});
+ }
+
/// Build and insert \p Dst = G_INTRINSIC_ROUNDEVEN \p Src0, \p Src1
MachineInstrBuilder
buildIntrinsicRoundeven(const DstOp &Dst, const SrcOp &Src0,
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index df4b264af72a8..a82b3555ae924 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -665,6 +665,12 @@ HANDLE_TARGET_OPCODE(G_SITOFP)
/// Generic unsigned-int to float conversion
HANDLE_TARGET_OPCODE(G_UITOFP)
+/// Generic saturating float to signed-int conversion
+HANDLE_TARGET_OPCODE(G_FPTOSI_SAT)
+
+/// Generic saturating float to unsigned-int conversion
+HANDLE_TARGET_OPCODE(G_FPTOUI_SAT)
+
/// Generic FP absolute value.
HANDLE_TARGET_OPCODE(G_FABS)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 4abffe6476c85..eae1ff825a0d4 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -749,6 +749,18 @@ def G_UITOFP : GenericInstruction {
let hasSideEffects = false;
}
+def G_FPTOSI_SAT : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
+def G_FPTOUI_SAT : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
def G_FABS : GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src);
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 560d3b434d07d..be47f50a803e0 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -98,6 +98,8 @@ def : GINodeEquiv<G_FPTOSI, fp_to_sint>;
def : GINodeEquiv<G_FPTOUI, fp_to_uint>;
def : GINodeEquiv<G_SITOFP, sint_to_fp>;
def : GINodeEquiv<G_UITOFP, uint_to_fp>;
+def : GINodeEquiv<G_FPTOSI_SAT, fp_to_sint_sat>;
+def : GINodeEquiv<G_FPTOUI_SAT, fp_to_uint_sat>;
def : GINodeEquiv<G_FADD, fadd>;
def : GINodeEquiv<G_FSUB, fsub>;
def : GINodeEquiv<G_FMA, fma>;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c06b35a98e434..c80a90557a569 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2330,6 +2330,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
MachineInstr::copyFlagsFromInstruction(CI));
return true;
}
+ case Intrinsic::fptosi_sat:
+ MIRBuilder.buildFPTOSI_SAT(getOrCreateVReg(CI),
+ getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ case Intrinsic::fptoui_sat:
+ MIRBuilder.buildFPTOUI_SAT(getOrCreateVReg(CI),
+ getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
case Intrinsic::memcpy_inline:
return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMCPY_INLINE);
case Intrinsic::memcpy:
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 430fcae731689..d9c243327e92a 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1715,6 +1715,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
}
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI_SAT:
+ case TargetOpcode::G_FPTOSI_SAT:
return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
case TargetOpcode::G_FPEXT:
if (TypeIdx != 0)
@@ -2358,7 +2360,8 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
- switch (MI.getOpcode()) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
default:
return UnableToLegalize;
case TargetOpcode::G_ATOMICRMW_XCHG:
@@ -2442,13 +2445,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Register SrcReg = MI.getOperand(1).getReg();
// First extend the input.
- unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
- MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
+ unsigned ExtOpc = Opcode == TargetOpcode::G_CTTZ ||
+ Opcode == TargetOpcode::G_CTTZ_ZERO_UNDEF
? TargetOpcode::G_ANYEXT
: TargetOpcode::G_ZEXT;
auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
LLT CurTy = MRI.getType(SrcReg);
- unsigned NewOpc = MI.getOpcode();
+ unsigned NewOpc = Opcode;
if (NewOpc == TargetOpcode::G_CTTZ) {
// The count is the same in the larger type except if the original
// value was zero. This can be handled by setting the bit just off
@@ -2464,8 +2467,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
// Perform the operation at the larger size.
auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
// This is already the correct result for CTPOP and CTTZs
- if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
- MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
+ if (Opcode == TargetOpcode::G_CTLZ ||
+ Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
// The correct result is NewOp - (Difference in widety and current ty).
unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
MIBNewOp = MIRBuilder.buildSub(
@@ -2614,8 +2617,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changingInstr(MI);
if (TypeIdx == 0) {
- unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
- TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+ unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
+ : TargetOpcode::G_ZEXT;
widenScalarSrc(MI, WideTy, 1, CvtOp);
widenScalarDst(MI, WideTy);
@@ -2697,6 +2700,47 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
else
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0) {
+ Register OldDst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(OldDst);
+ Register ExtReg = MRI.createGenericVirtualRegister(WideTy);
+ Register NewDst;
+ MI.getOperand(0).setReg(ExtReg);
+ uint64_t ShortBits = Ty.getScalarSizeInBits();
+ uint64_t WideBits = WideTy.getScalarSizeInBits();
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
+ // z = i16 fptosi_sat(a)
+ // ->
+ // x = i32 fptosi_sat(a)
+ // y = smin(x, 32767)
+ // z = smax(y, -32768)
+ auto MaxVal = MIRBuilder.buildConstant(
+ WideTy, APInt::getSignedMaxValue(ShortBits).sext(WideBits));
+ auto MinVal = MIRBuilder.buildConstant(
+ WideTy, APInt::getSignedMinValue(ShortBits).sext(WideBits));
+ Register MidReg =
+ MIRBuilder.buildSMin(WideTy, ExtReg, MaxVal).getReg(0);
+ NewDst = MIRBuilder.buildSMax(WideTy, MidReg, MinVal).getReg(0);
+ } else {
+ // z = i16 fptoui_sat(a)
+ // ->
+ // x = i32 fptoui_sat(a)
+ // y = smin(x, 65535)
+ auto MaxVal = MIRBuilder.buildConstant(
+ WideTy, APInt::getAllOnes(ShortBits).zext(WideBits));
+ NewDst = MIRBuilder.buildUMin(WideTy, ExtReg, MaxVal).getReg(0);
+ }
+ MIRBuilder.buildTrunc(OldDst, NewDst);
+ } else
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
+
Observer.changedInstr(MI);
return Legalized;
case TargetOpcode::G_LOAD:
@@ -2921,7 +2965,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_FLDEXP:
case TargetOpcode::G_STRICT_FLDEXP: {
if (TypeIdx == 0) {
- if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
+ if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
return UnableToLegalize;
Observer.changingInstr(MI);
@@ -3914,6 +3958,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
return lowerFPTOUI(MI);
case G_FPTOSI:
return lowerFPTOSI(MI);
+ case G_FPTOUI_SAT:
+ case G_FPTOSI_SAT:
+ return lowerFPTOINT_SAT(MI);
case G_FPTRUNC:
return lowerFPTRUNC(MI);
case G_FPOWI:
@@ -4713,6 +4760,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_UITOFP:
case G_FPTOSI:
case G_FPTOUI:
+ case G_FPTOSI_SAT:
+ case G_FPTOUI_SAT:
case G_INTTOPTR:
case G_PTRTOINT:
case G_ADDRSPACE_CAST:
@@ -5504,6 +5553,8 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP: {
Observer.changingInstr(MI);
@@ -7012,6 +7063,102 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
+ unsigned SatWidth = DstTy.getScalarSizeInBits();
+
+ // Determine minimum and maximum integer values and their corresponding
+ // floating-point values.
+ APInt MinInt, MaxInt;
+ if (IsSigned) {
+ MinInt = APInt::getSignedMinValue(SatWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth);
+ } else {
+ MinInt = APInt::getMinValue(SatWidth);
+ MaxInt = APInt::getMaxValue(SatWidth);
+ }
+
+ const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
+ APFloat MinFloat(Semantics);
+ APFloat MaxFloat(Semantics);
+
+ APFloat::opStatus MinStatus =
+ MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus =
+ MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
+ !(MaxStatus & APFloat::opStatus::opInexact);
+
+ // If the integer bounds are exactly representable as floats and min/max are
+ // legal, emit a min+max+fptoi sequence. Otherwise we have to use a sequence
+ // of comparisons and selects.
+ bool MinMaxLegal = LI.isLegal({TargetOpcode::G_FMINNUM, SrcTy}) &&
+ LI.isLegal({TargetOpcode::G_FMAXNUM, SrcTy});
+ if (AreExactFloatBounds && MinMaxLegal) {
+ // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
+ auto Max = MIRBuilder.buildFMaxNum(
+ SrcTy, Src, MIRBuilder.buildFConstant(SrcTy, MinFloat));
+ // Clamp by MaxFloat from above. NaN cannot occur.
+ auto Min = MIRBuilder.buildFMinNum(
+ SrcTy, Max, MIRBuilder.buildFConstant(SrcTy, MaxFloat));
+ // Convert clamped value to integer. In the unsigned case we're done,
+ // because we mapped NaN to MinFloat, which will cast to zero.
+ if (!IsSigned) {
+ MIRBuilder.buildFPTOUI(Dst, Min);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Otherwise, select 0 if Src is NaN.
+ auto FpToInt = MIRBuilder.buildFPTOSI(DstTy, Min);
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
+ DstTy.changeElementSize(1), Src, Src);
+ MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0),
+ FpToInt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Result of direct conversion. The assumption here is that the operation is
+ // non-trapping and it's fine to apply it to an out-of-range value if we
+ // select it away later.
+ auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(DstTy, Src)
+ : MIRBuilder.buildFPTOUI(DstTy, Src);
+
+ // If Src ULT MinFloat, select MinInt. In particular, this also selects
+ // MinInt if Src is NaN.
+ auto ULT =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, SrcTy.changeElementSize(1), Src,
+ MIRBuilder.buildFConstant(SrcTy, MinFloat));
+ auto Max = MIRBuilder.buildSelect(
+ DstTy, ULT, MIRBuilder.buildConstant(DstTy, MinInt), FpToInt);
+ // If Src OGT MaxFloat, select MaxInt.
+ auto OGT =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Src,
+ MIRBuilder.buildFConstant(SrcTy, MaxFloat));
+
+ // In the unsigned case we are done, because we mapped NaN to MinInt, which
+ // is already zero.
+ if (!IsSigned) {
+ MIRBuilder.buildSelect(Dst, OGT, MIRBuilder.buildConstant(DstTy, MaxInt),
+ Max);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Otherwise, select 0 if Src is NaN.
+ auto Min = MIRBuilder.buildSelect(
+ DstTy, OGT, MIRBuilder.buildConstant(DstTy, MaxInt), Max);
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
+ DstTy.changeElementSize(1), Src, Src);
+ MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0), Min);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index d32007ec45fb6..9966c0f7c0450 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2130,6 +2130,12 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
}
return false;
}
+ case TargetOpcode::G_FPTOSI_SAT:
+ I.setDesc(TII.get(TargetOpcode::G_FPTOSI));
+ return true;
+ case TargetOpcode::G_FPTOUI_SAT:
+ I.setDesc(TII.get(TargetOpcode::G_FPTOUI));
+ return true;
default:
return false;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index fef0b722efe45..f0444416f84f7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -708,6 +708,56 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.libcallFor(
{{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
+ getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
+ .legalFor({{s32, s32},
+ {s64, s32},
+ {s32, s64},
+ {s64, s64},
+ {v2s64, v2s64},
+ {v4s32, v4s32},
+ {v2s32, v2s32}})
+ .legalIf([=](const LegalityQuery &Query) {
+ return HasFP16 &&
+ (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
+ Query.Types[1] == v8s16) &&
+ (Query.Types[0] == s32 || Query.Types[0] == s64 ||
+ Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
+ })
+ // Handle types larger than i64 by scalarizing/lowering.
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+ .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
+ // The range of a fp16 value fits into an i17, so we can lower the width
+ // to i64.
+ .narrowScalarIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1] == s16 &&
+ Query.Types[0].getSizeInBits() > 64;
+ },
+ changeTo(0, s64))
+ .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
+ .moreElementsToNextPow2(0)
+ .widenScalarToNextPow2(0, /*MinSize=*/32)
+ .minScalar(0, s32)
+ .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ unsigned ITySize = Query.Types[0].getScalarSizeInBits();
+ return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
+ ITySize > Query.Types[1].getScalarSizeInBits();
+ },
+ LegalizeMutations::changeElementSizeTo(1, 0))
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ unsigned FTySize = Query.Types[1].getScalarSizeInBits();
+ return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
+ Query.Types[0].getScalarSizeInBits() < FTySize;
+ },
+ LegalizeMutations::changeElementSizeTo(0, 1))
+ .widenScalarOrEltToNextPow2(0)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampMaxNumElements(0, s64, 2);
+
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
.legalIf([=](const LegalityQuery &Query) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 5616d063f70bc..be5465a299b56 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -559,6 +559,8 @@ bool AArc...
[truncated]
|
Could you link/fixes #94694 and maybe take the test case? |
I believe that whilst #94694 is currently related to fptosi.sat, the same issue with infinite combines would come up from any other generic intrinsic. Prior to this patch GISel didn't know anything special about fptosi.sat, so it was just using a G_INTRINSIC and I believe the same problem would still exist for anything that wasn't now transformed into a G_FPTOSI_SAT. |
bool MinMaxLegal = LI.isLegal({TargetOpcode::G_FMINNUM, SrcTy}) && | ||
LI.isLegal({TargetOpcode::G_FMAXNUM, SrcTy}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exact legality isn't exactly the right check. Ideally we would figure out if the operation would ultimately be lowered to a fmin/fmax.
Looking at this again, below you are using fcmp / select. We should probably just pick one or the other as the canonical form, and let the nnan help optimizations convert into the appropriate flavor of min/max
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The main difference between the two forms is whether we can clamp the float (because AreExactFloatBounds), or whether we need to compare the original integer to clamp the floats.
I could remove the MinMaxLegal check and always produce the min/max even if it is illegal. It likely just means a slightly different soft-fp call.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I don't think this should be conditionalized. Just pick one as the canonical no-nan min/max case. If we need the conditions, it should be done as a separate change.
We do have a possible issue that we don't have an optimizing combiner mid-legalize, so the target won't get a chance to morph whatever the canonical nnan fmin/fmax is into whichever form it prefers
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping here, the legality shouldn't matter. Do the cmp + select with nnan and let the combiners figure it out?
@@ -2130,6 +2130,12 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { | |||
} | |||
return false; | |||
} | |||
case TargetOpcode::G_FPTOSI_SAT: | |||
I.setDesc(TII.get(TargetOpcode::G_FPTOSI)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would hope this would have been a trivial selection pattern?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes - I had forgotten I had added this. The SDAG nodes take an extra ValueType variable to specify the width of the saturation, but as far as I can tell that isn't very useful as the instructions will always saturate to the maximum width (and it makes some parts of type legalization more difficult).
It does mean that the existing fp_to_sint_sat nodes will not work naturally. So this just changed the nodes as at this point we know the AArch64 G_FPTOSI instructions will be saturating, and all the existing patterns could apply.
This needs tightening up, but I'm not sure if it can work with the existing patterns work - they don't get imported by the gisel importer as far as I understand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be necessary if the floating point type required legalization. We probably need an equivalent operand here, and then some TableGen magic to get the patterns to work
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Although maybe that would be better handled by having it clamp during the legalization?
@@ -708,6 +708,55 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) | |||
.libcallFor( | |||
{{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}}); | |||
|
|||
getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is one complicated rule set!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah - fptosi_sat is likely to be one of the more complex ones. (Note that in the future scalable vectors and bf16 could also add extra complexity). I do wonder if having to handle every different type combo in these definitions is the best, or whether there should be some overarching definition for types that is applied to every operation before the individual rules kick in.
8e9456d
to
ca33763
Compare
ca33763
to
91665b7
Compare
Rebase and ping, thanks. |
afac44a
to
3095a5d
Compare
// is already zero. | ||
if (!IsSigned) { | ||
MIRBuilder.buildSelect(Dst, OGT, MIRBuilder.buildConstant(DstTy, MaxInt), | ||
Max, MachineInstr::FmNoNans); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the fcmp also get the flag?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC the FCmp is based on the original value, so could still be Nan. This select appears to be integer though, so presumably shouldn't have FMF's.
3095a5d
to
975d1ca
Compare
This is an implementation of the saturating fp to int conversions for GlobalISel. On AArch64 the converstion instrctions work this way, producing saturating results. LegalizerHelper::lowerFPTOINT_SAT is ported from SDAG. AArch64 has a lot of existing tests for fptosi_sat, covering a wide range of types. I have tried to make most of them work all at once, but a few fall back due to other missing features such as f128 handling for min/max.
975d1ca
to
9ea9b04
Compare
This is an implementation of the saturating fp to int conversions for GlobalISel. On AArch64 the converstion instrctions work this way, producing saturating results. LegalizerHelper::lowerFPTOINT_SAT is ported from SDAG.
AArch64 has a lot of existing tests for fptosi_sat, covering a wide range of types. I have tried to make most of them work all at once, but a few fall back due to other missing features such as f128 handling for min/max.