[DAG] Handle truncated splat in isBoolConstant #145473
Conversation
This allows a truncated splat / buildvector in isBoolConstant, so that certain `not` patterns can be recognized post-legalization and the vselect combines can optimize them.

An override for x86 AVX512 predicated vectors is required to avoid an infinite recursion: the generic `vselect (not Cond), N1, N2 -> vselect Cond, N2, N1` fold would otherwise keep undoing the x86 combine that detects zero vectors and swaps the operands while inverting the condition. From:

```
// Check if the first operand is all zeros and Cond type is vXi1.
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
```
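As a minimal IR sketch of the pattern this enables (illustrative only; the function name is hypothetical and this is not taken from the patch's tests): at the DAG level the `xor` below becomes a `not` of the condition, and after legalization the all-ones operand may appear as a splat whose elements are wider than i1, where only the truncated low bit matters. Once isBoolConstant accepts that truncated splat, extractBooleanFlip can recognize the `not` and the vselect operands get swapped instead of materialising the inversion.

```llvm
; Illustrative sketch, not a test from this patch.
define <vscale x 4 x i32> @flip(<vscale x 4 x i1> %c,
                                <vscale x 4 x i32> %a,
                                <vscale x 4 x i32> %b) {
  ; Becomes (xor %c, splat(-1)) in the DAG, i.e. a "not" of the condition.
  %not = xor <vscale x 4 x i1> %c, splat (i1 true)
  ; With the truncated splat recognized, this folds to
  ;   vselect %c, %b, %a
  ; and the xor disappears.
  %r = select <vscale x 4 x i1> %not, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b
  ret <vscale x 4 x i32> %r
}
```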
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)
Patch is 314.63 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/145473.diff

13 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index a98e46c587273..5096e0bd70e6e 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2479,8 +2479,7 @@ class SelectionDAG {
/// Check if a value \op N is a constant using the target's BooleanContent for
/// its type.
- LLVM_ABI std::optional<bool>
- isBoolConstant(SDValue N, bool AllowTruncation = false) const;
+ LLVM_ABI std::optional<bool> isBoolConstant(SDValue N) const;
/// Set CallSiteInfo to be associated with Node.
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo) {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 727526055e592..f92d5f4ee79fe 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4375,6 +4375,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
Op.getOpcode() == ISD::SPLAT_VECTOR_PARTS;
}
+ virtual bool isTargetCanonicalSelect(SDNode *N) const { return false; }
+
struct DAGCombinerInfo {
void *DC; // The DAG Combiner object.
CombineLevel Level;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6191e61791678..c9a493cdd7a89 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12967,8 +12967,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
return V;
// vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
- if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
- return DAG.getSelect(DL, VT, F, N2, N1);
+ if (!TLI.isTargetCanonicalSelect(N))
+ if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
+ return DAG.getSelect(DL, VT, F, N2, N1);
// select (sext m), (add X, C), X --> (add X, (and C, (sext m))))
if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N2 && N1->hasOneUse() &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 30ee6a99b9dfc..3cdd2ac3a18d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -10349,7 +10349,7 @@ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
// select true, T, F --> T
// select false, T, F --> F
- if (auto C = isBoolConstant(Cond, /*AllowTruncation=*/true))
+ if (auto C = isBoolConstant(Cond))
return *C ? T : F;
// select ?, T, T --> T
@@ -13562,13 +13562,14 @@ bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
return false;
}
-std::optional<bool> SelectionDAG::isBoolConstant(SDValue N,
- bool AllowTruncation) const {
- ConstantSDNode *Const = isConstOrConstSplat(N, false, AllowTruncation);
+std::optional<bool> SelectionDAG::isBoolConstant(SDValue N) const {
+ ConstantSDNode *Const =
+ isConstOrConstSplat(N, false, /*AllowTruncation=*/true);
if (!Const)
return std::nullopt;
- const APInt &CVal = Const->getAPIntValue();
+ EVT VT = N->getValueType(0);
+ const APInt CVal = Const->getAPIntValue().trunc(VT.getScalarSizeInBits());
switch (TLI->getBooleanContents(N.getValueType())) {
case TargetLowering::ZeroOrOneBooleanContent:
if (CVal.isOne())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..c0ed886475491 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4975,6 +4975,15 @@ X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
return getTargetConstantFromNode(LD);
}
+bool X86TargetLowering::isTargetCanonicalSelect(SDNode *N) const {
+ SDValue Cond = N->getOperand(0);
+ SDValue RHS = N->getOperand(2);
+ EVT CondVT = Cond.getValueType();
+ return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
+ CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(RHS.getNode());
+}
+
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 5cb6b3e493a32..20c90ebf9a5e2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1356,6 +1356,8 @@ namespace llvm {
TargetLowering::isTargetCanonicalConstantNode(Op);
}
+ bool isTargetCanonicalSelect(SDNode *N) const override;
+
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 584c29ebcfc04..dfd0a05372b9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -16,16 +16,15 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z2.d, #0xffffffff80000000
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s
; CHECK-NEXT: mov z3.s, w8
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT: mov z1.d, #0xffffffff80000000
; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s
; CHECK-NEXT: mov z3.d, #0x7fffffff
-; CHECK-NEXT: not p1.b, p0/z, p1.b
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; CHECK-NEXT: mov z1.d, p1/m, z2.d
; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d
@@ -40,16 +39,15 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z2.s, #0x80000000
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s
; CHECK-NEXT: mov z3.s, w8
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s
+; CHECK-NEXT: mov z1.s, #0x80000000
; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s
; CHECK-NEXT: mov z3.s, #0x7fffffff
-; CHECK-NEXT: not p1.b, p0/z, p1.b
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; CHECK-NEXT: mov z1.s, p1/m, z2.s
; CHECK-NEXT: sel z0.s, p2, z3.s, z1.s
@@ -69,27 +67,25 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z6.s, #0x7fffffff
+; CHECK-NEXT: mov z3.s, #0x80000000
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
-; CHECK-NEXT: mov z3.s, w8
; CHECK-NEXT: movprfx z4, z0
; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s
-; CHECK-NEXT: movprfx z5, z1
-; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s
+; CHECK-NEXT: mov z5.s, w8
+; CHECK-NEXT: movprfx z6, z1
+; CHECK-NEXT: fcvtzs z6.s, p0/m, z1.s
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s
; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s
-; CHECK-NEXT: mov z2.s, #0x80000000
-; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s
-; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: not p2.b, p0/z, p2.b
-; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s
+; CHECK-NEXT: mov z2.s, #0x7fffffff
+; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z5.s
+; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z5.s
+; CHECK-NEXT: sel z4.s, p1, z4.s, z3.s
+; CHECK-NEXT: mov z3.s, p2/m, z6.s
; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s
; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s
-; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s
-; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s
+; CHECK-NEXT: sel z0.s, p3, z2.s, z4.s
+; CHECK-NEXT: sel z1.s, p4, z2.s, z3.s
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0
@@ -105,19 +101,19 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z3.s, #-32768 // =0xffffffffffff8000
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: mov w8, #65024 // =0xfe00
; CHECK-NEXT: movk w8, #18175, lsl #16
-; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s
-; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT: mov z2.s, #32767 // =0x7fff
-; CHECK-NEXT: not p1.b, p0/z, p1.b
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z1.s, #32767 // =0x7fff
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT: mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT: sel z2.s, p1, z2.s, z3.s
+; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
@@ -134,27 +130,26 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z5.s, #32767 // =0x7fff
+; CHECK-NEXT: mov z6.s, #32767 // =0x7fff
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: mov w8, #65024 // =0xfe00
; CHECK-NEXT: movk w8, #18175, lsl #16
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s
-; CHECK-NEXT: movprfx z4, z0
-; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s
+; CHECK-NEXT: movprfx z5, z0
+; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.s
+; CHECK-NEXT: mov z4.s, w8
; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s
; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT: mov z2.s, w8
-; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s
-; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: not p2.b, p0/z, p2.b
-; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z4.s
+; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z4.s
+; CHECK-NEXT: sel z3.s, p1, z3.s, z2.s
+; CHECK-NEXT: mov z2.s, p2/m, z5.s
; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s
-; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s
+; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s
+; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0
@@ -171,16 +166,15 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s
; CHECK-NEXT: mov z3.s, w8
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT: mov z1.d, #0x8000000000000000
; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s
; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT: not p1.b, p0/z, p1.b
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; CHECK-NEXT: mov z1.d, p1/m, z2.d
; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d
@@ -204,25 +198,23 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT: mov z3.s, w8
-; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff
-; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s
-; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
+; CHECK-NEXT: mov z3.d, #0x8000000000000000
+; CHECK-NEXT: mov z5.s, w8
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s
-; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s
-; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: not p2.b, p0/z, p2.b
-; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT: movprfx z6, z0
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.s
+; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z5.s
+; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z5.s
+; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d
; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT: mov z3.d, p2/m, z6.d
+; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d
+; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
@@ -248,17 +240,16 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z2.d, #0xffffffff80000000
; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000
; CHECK-NEXT: movk x8, #16863, lsl #48
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d
; CHECK-NEXT: mov z3.d, x8
; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT: mov z1.d, #0xffffffff80000000
; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d
; CHECK-NEXT: mov z3.d, #0x7fffffff
-; CHECK-NEXT: not p1.b, p0/z, p1.b
; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
; CHECK-NEXT: mov z1.d, p1/m, z2.d
; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d
@@ -278,28 +269,26 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z6.d, #0x7fffffff
+; CHECK-NEXT: mov z3.d, #0xffffffff80000000
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000
; CHECK-NEXT: movk x8, #16863, lsl #48
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d
-; CHECK-NEXT: mov z3.d, x8
+; CHECK-NEXT: movprfx z6, z0
+; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.d
+; CHECK-NEXT: mov z5.d, x8
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d
-; CHECK-NEXT: mov z2.d, #0xffffffff80000000
-; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d
-; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: not p2.b, p0/z, p2.b
-; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT: mov z2.d, #0x7fffffff
+; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z5.d
+; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z5.d
+; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d
+; CHECK-NEXT: mov z3.d, p2/m, z6.d
; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d
; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d
+; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
@@ -327,49 +316,45 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
; CHECK-NEXT: mov z5.d, #0xffffffff80000000
; CHECK-NEXT: mov z4.d, x8
; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000
-; CHECK-NEXT: mov z26.d, #0x7fffffff
; CHECK-NEXT: movk x8, #16863, lsl #48
-; CHECK-NEXT: movprfx z7, z0
-; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d
-; CHECK-NEXT: movprfx z24, z3
-; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d
+; CHECK-NEXT: movprfx z7, z1
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d
+; CHECK-NEXT: movprfx z24, z0
+; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d
; CHECK-NEXT: mov z6.d, x8
-; CHECK-NEXT: movprfx z25, z2
-; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d
+; CHECK-NEXT: movprfx z25, z3
+; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d
+; CHECK-NEXT: movprfx z26, z2
+; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d
; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d
; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d
; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d
; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d
-; CHECK-NEXT: movprfx z4, z1
-; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d
+; CHECK-NEXT: mov z4.d, #0x7fffffff
; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d
; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d
; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: not p2.b, p0/z, p2.b
-; CHECK-NEXT: not p3.b, p0/z, p3.b
-; CHECK-NEXT: mov z4.d, p1/m, z5.d
+; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d
; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d
-; CHECK-NEXT: not p4.b, p0/z, p4.b
-; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d
+; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d
+; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d
+; CHECK-NEXT: mov z5.d, p4/m, z26.d
; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d
-; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d
; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d
-; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d
; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d
; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d
-; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d
+; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d
+; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: uzp1 z1.s, z3.s, z2.s
; CHECK-NEXT: addvl sp, sp, #1
@@ -389,27 +374,26 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z5.d, #32767 // =0x7fff
+; CHECK-NEXT: mov z6.d, #32767 // =0x7fff
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000
; CHECK-NEXT: movk x8, #16607, lsl #48
; CHECK-NEXT: movprfx z3,...
[truncated]
@@ -4375,6 +4375,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
       Op.getOpcode() == ISD::SPLAT_VECTOR_PARTS;
   }

+  virtual bool isTargetCanonicalSelect(SDNode *N) const { return false; }
(style) add a description of what a canonical select means
This seems like a very specific hack, can you do this another way
We could try moving the AVX512 fold to X86DAGToDAGISel / isel patterns? @phoebewang WDYT?
X86DAGToDAGISel sounds good to me. We have hundreds of AVX512 instructions, adding patterns for them is too verbose.
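For context, a small IR sketch (illustrative, not from the patch) of the AVX512 shape the new hook protects: when the condition is a vXi1 mask and the false operand is all zeros, the select maps directly onto a zero-masked move, which is the form the x86 combine canonicalises towards and the form isTargetCanonicalSelect now reports as canonical.

```llvm
; Illustrative sketch assuming an AVX512 target (e.g. -mattr=+avx512vl).
; A select with an all-zero false operand and a k-register condition can
; be emitted as a single zero-masked ({z}) move.  The hook stops the
; generic vselect(not) flip from undoing x86's swap into this shape,
; which previously caused infinite recursion.
define <8 x i32> @zero_masked(<8 x i1> %c, <8 x i32> %x) {
  %r = select <8 x i1> %c, <8 x i32> %x, <8 x i32> zeroinitializer
  ret <8 x i32> %r
}
```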
Force-pushed from 212f3de to d84d9de