Skip to content

Commit a4516da

Browse files
authored
[AArch64] - Fold and and cmp into tst (#110347)
Fixes llvm/llvm-project#102703. https://godbolt.org/z/nfj8xsb1Y

The following pattern:
```
%2 = and i32 %0, 254
%3 = icmp eq i32 %2, 0
```
is optimised by instcombine into:
```
%3 = icmp ult i32 %0, 2
```
However, the post-instcombine IR leads to worse AArch64 code than the unoptimised version.

Pre instcombine:
```
tst w0, #0xfe
cset w0, eq
ret
```
Post instcombine:
```
and w8, w0, #0xff
cmp w8, #2
cset w0, lo
ret
```
In the unoptimised version, SelectionDAG converts `SETCC (AND X 254) 0 EQ` into `CSEL 0 1 1 (ANDS X 254)`, which gets emitted as a `tst`. In the optimised version, SelectionDAG converts `SETCC (AND X 255) 2 ULT` into `CSEL 0 1 2 (SUBS (AND X 255) 2)`, which gets emitted as an `and`/`cmp`.

This PR adds an optimisation to `AArch64ISelLowering`, converting `SETCC (AND X Y) Z ULT` into `SETCC (AND X (Y & ~(Z - 1))) 0 EQ` when `Z` is a power of two. This makes SelectionDAG/Codegen produce the same optimised code for both examples.
1 parent b0c9f02 commit a4516da

File tree

3 files changed

+252
-15
lines changed

3 files changed

+252
-15
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+26
Original file line numberDiff line numberDiff line change
@@ -4301,6 +4301,29 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
43014301
Op.getOperand(1));
43024302
}
43034303

4304+
// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
4305+
// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4306+
// (AND X Y) Z which produces a better opt with EmitComparison
4307+
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4308+
SelectionDAG &DAG, const SDLoc dl) {
4309+
if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4310+
ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4311+
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4312+
if (LHSConstOp && RHSConst) {
4313+
uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4314+
uint64_t RHSConstant = RHSConst->getZExtValue();
4315+
if (isPowerOf2_64(RHSConstant)) {
4316+
uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4317+
LHS =
4318+
DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4319+
DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4320+
RHS = DAG.getConstant(0, dl, RHS.getValueType());
4321+
CC = ISD::SETEQ;
4322+
}
4323+
}
4324+
}
4325+
}
4326+
43044327
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
43054328
SelectionDAG &DAG) const {
43064329
EVT VT = Op.getValueType();
@@ -10596,6 +10619,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
1059610619
}
1059710620

1059810621
if (LHS.getValueType().isInteger()) {
10622+
10623+
simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
10624+
1059910625
SDValue CCVal;
1060010626
SDValue Cmp = getAArch64Cmp(
1060110627
LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
3+
4+
5+
; i8 value compared ult 8 (a power of two): the checks expect a single tst
; with mask 0xf8, which equals 0xff & ~(8 - 1), followed by cset eq.
define i1 @lt8_u8(i8 %0) {
6+
; CHECK-LABEL: lt8_u8:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: tst w0, #0xf8
9+
; CHECK-NEXT: cset w0, eq
10+
; CHECK-NEXT: ret
11+
%2 = icmp ult i8 %0, 8
12+
ret i1 %2
13+
}
14+
15+
; i8 value compared ult 32 (a power of two): the checks expect a single tst
; with mask 0xe0, which equals 0xff & ~(32 - 1), followed by cset eq.
define i1 @lt32_u8(i8 %0) {
16+
; CHECK-LABEL: lt32_u8:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: tst w0, #0xe0
19+
; CHECK-NEXT: cset w0, eq
20+
; CHECK-NEXT: ret
21+
%2 = icmp ult i8 %0, 32
22+
ret i1 %2
23+
}
24+
25+
; i8 value compared ult 64 (a power of two): the checks expect a single tst
; with mask 0xc0, which equals 0xff & ~(64 - 1), followed by cset eq.
define i1 @lt64_u8(i8 %0) {
26+
; CHECK-LABEL: lt64_u8:
27+
; CHECK: // %bb.0:
28+
; CHECK-NEXT: tst w0, #0xc0
29+
; CHECK-NEXT: cset w0, eq
30+
; CHECK-NEXT: ret
31+
%2 = icmp ult i8 %0, 64
32+
ret i1 %2
33+
}
34+
35+
; i32 value compared ult 8 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence (no tst).
define i1 @lt8_u32(i32 %0) {
36+
; CHECK-LABEL: lt8_u32:
37+
; CHECK: // %bb.0:
38+
; CHECK-NEXT: cmp w0, #8
39+
; CHECK-NEXT: cset w0, lo
40+
; CHECK-NEXT: ret
41+
%2 = icmp ult i32 %0, 8
42+
ret i1 %2
43+
}
44+
45+
; i32 value compared ult 32 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence (no tst).
define i1 @lt32_u32(i32 %0) {
46+
; CHECK-LABEL: lt32_u32:
47+
; CHECK: // %bb.0:
48+
; CHECK-NEXT: cmp w0, #32
49+
; CHECK-NEXT: cset w0, lo
50+
; CHECK-NEXT: ret
51+
%2 = icmp ult i32 %0, 32
52+
ret i1 %2
53+
}
54+
55+
; i32 value compared ult 64 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence (no tst).
define i1 @lt64_u32(i32 %0) {
56+
; CHECK-LABEL: lt64_u32:
57+
; CHECK: // %bb.0:
58+
; CHECK-NEXT: cmp w0, #64
59+
; CHECK-NEXT: cset w0, lo
60+
; CHECK-NEXT: ret
61+
%2 = icmp ult i32 %0, 64
62+
ret i1 %2
63+
}
64+
65+
; i64 value compared ult 8 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence on the x register (no tst).
define i1 @lt8_u64(i64 %0) {
66+
; CHECK-LABEL: lt8_u64:
67+
; CHECK: // %bb.0:
68+
; CHECK-NEXT: cmp x0, #8
69+
; CHECK-NEXT: cset w0, lo
70+
; CHECK-NEXT: ret
71+
%2 = icmp ult i64 %0, 8
72+
ret i1 %2
73+
}
74+
75+
; i64 value compared ult 32 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence on the x register (no tst).
define i1 @lt32_u64(i64 %0) {
76+
; CHECK-LABEL: lt32_u64:
77+
; CHECK: // %bb.0:
78+
; CHECK-NEXT: cmp x0, #32
79+
; CHECK-NEXT: cset w0, lo
80+
; CHECK-NEXT: ret
81+
%2 = icmp ult i64 %0, 32
82+
ret i1 %2
83+
}
84+
85+
; i64 value compared ult 64 with no AND feeding the compare: the checks expect
; the plain cmp/cset lo sequence on the x register (no tst).
define i1 @lt64_u64(i64 %0) {
86+
; CHECK-LABEL: lt64_u64:
87+
; CHECK: // %bb.0:
88+
; CHECK-NEXT: cmp x0, #64
89+
; CHECK-NEXT: cset w0, lo
90+
; CHECK-NEXT: ret
91+
%2 = icmp ult i64 %0, 64
92+
ret i1 %2
93+
}
94+
95+
; i8 value masked with 5 and compared ult 16. The mask has no bits at or above
; 16 (5 & ~15 == 0), so the compare is always true; the checks record the
; zero-register mov/cmp/cset sequence currently emitted for this case.
define i1 @lt8_u16_and_5(i8 %0) {
96+
; CHECK-LABEL: lt8_u16_and_5:
97+
; CHECK: // %bb.0:
98+
; CHECK-NEXT: mov w8, wzr
99+
; CHECK-NEXT: cmp w8, #0
100+
; CHECK-NEXT: cset w0, eq
101+
; CHECK-NEXT: ret
102+
%2 = and i8 %0, 5
103+
%3 = icmp ult i8 %2, 16
104+
ret i1 %3
105+
}
106+
107+
; i8 value masked with 19 and compared ult 16: the checks expect a single tst
; with mask 0x10, which equals 19 & ~(16 - 1), followed by cset eq.
define i1 @lt8_u16_and_19(i8 %0) {
108+
; CHECK-LABEL: lt8_u16_and_19:
109+
; CHECK: // %bb.0:
110+
; CHECK-NEXT: tst w0, #0x10
111+
; CHECK-NEXT: cset w0, eq
112+
; CHECK-NEXT: ret
113+
%2 = and i8 %0, 19
114+
%3 = icmp ult i8 %2, 16
115+
ret i1 %3
116+
}
117+
118+
; i32 value masked with 7 and compared ult 16. The mask has no bits at or above
; 16 (7 & ~15 == 0), so the compare is always true; the checks record the
; zero-register mov/cmp/cset sequence currently emitted for this case.
define i1 @lt32_u16_and_7(i32 %0) {
119+
; CHECK-LABEL: lt32_u16_and_7:
120+
; CHECK: // %bb.0:
121+
; CHECK-NEXT: mov w8, wzr
122+
; CHECK-NEXT: cmp w8, #0
123+
; CHECK-NEXT: cset w0, eq
124+
; CHECK-NEXT: ret
125+
%2 = and i32 %0, 7
126+
%3 = icmp ult i32 %2, 16
127+
ret i1 %3
128+
}
129+
130+
; i32 value masked with 21 and compared ult 16: the checks expect a single tst
; with mask 0x10, which equals 21 & ~(16 - 1), followed by cset eq.
define i1 @lt32_u16_and_21(i32 %0) {
131+
; CHECK-LABEL: lt32_u16_and_21:
132+
; CHECK: // %bb.0:
133+
; CHECK-NEXT: tst w0, #0x10
134+
; CHECK-NEXT: cset w0, eq
135+
; CHECK-NEXT: ret
136+
%2 = and i32 %0, 21
137+
%3 = icmp ult i32 %2, 16
138+
ret i1 %3
139+
}
140+
141+
; i64 value masked with 9 and compared ult 16. The mask has no bits at or above
; 16 (9 & ~15 == 0), so the compare is always true; the checks record the
; zero-register mov/cmp/cset sequence currently emitted for this case.
define i1 @lt64_u16_and_9(i64 %0) {
142+
; CHECK-LABEL: lt64_u16_and_9:
143+
; CHECK: // %bb.0:
144+
; CHECK-NEXT: mov x8, xzr
145+
; CHECK-NEXT: cmp x8, #0
146+
; CHECK-NEXT: cset w0, eq
147+
; CHECK-NEXT: ret
148+
%2 = and i64 %0, 9
149+
%3 = icmp ult i64 %2, 16
150+
ret i1 %3
151+
}
152+
153+
; i64 value masked with 23 and compared ult 16: the checks expect a single tst
; with mask 0x10, which equals 23 & ~(16 - 1), followed by cset eq.
define i1 @lt64_u16_and_23(i64 %0) {
154+
; CHECK-LABEL: lt64_u16_and_23:
155+
; CHECK: // %bb.0:
156+
; CHECK-NEXT: tst x0, #0x10
157+
; CHECK-NEXT: cset w0, eq
158+
; CHECK-NEXT: ret
159+
%2 = and i64 %0, 23
160+
%3 = icmp ult i64 %2, 16
161+
ret i1 %3
162+
}
163+
164+
; negative test: the bound 3 is not a power of two, so the checks expect the
; and/cmp/cset lo sequence to remain (no tst).
165+
define i1 @lt3_u8(i8 %0) {
166+
; CHECK-LABEL: lt3_u8:
167+
; CHECK: // %bb.0:
168+
; CHECK-NEXT: and w8, w0, #0xff
169+
; CHECK-NEXT: cmp w8, #3
170+
; CHECK-NEXT: cset w0, lo
171+
; CHECK-NEXT: ret
172+
%2 = icmp ult i8 %0, 3
173+
ret i1 %2
174+
}
175+
176+
; negative test: the bound 3 is not a power of two and no AND feeds the
; compare, so the checks expect the plain cmp/cset lo sequence.
176+
define i1 @lt3_u32(i32 %0) {
177+
; CHECK-LABEL: lt3_u32:
178+
; CHECK: // %bb.0:
179+
; CHECK-NEXT: cmp w0, #3
180+
; CHECK-NEXT: cset w0, lo
181+
; CHECK-NEXT: ret
182+
%2 = icmp ult i32 %0, 3
183+
ret i1 %2
184+
}
186+
187+
; negative test: the bound 3 is not a power of two and no AND feeds the
; compare, so the checks expect the plain cmp/cset lo sequence on x0.
187+
define i1 @lt3_u64(i64 %0) {
188+
; CHECK-LABEL: lt3_u64:
189+
; CHECK: // %bb.0:
190+
; CHECK-NEXT: cmp x0, #3
191+
; CHECK-NEXT: cset w0, lo
192+
; CHECK-NEXT: ret
193+
%2 = icmp ult i64 %0, 3
194+
ret i1 %2
195+
}
197+
198+
; negative test: the AND result %2 is used by both the compare and the add, so
; it has more than one use; the checks expect the and/cmp/cset lo sequence to
; remain (no tst).
198+
define i32 @lt32_u16_multiple_use(i32 %0) {
199+
; CHECK-LABEL: lt32_u16_multiple_use:
200+
; CHECK: // %bb.0:
201+
; CHECK-NEXT: mov w8, #21 // =0x15
202+
; CHECK-NEXT: mov w9, #10 // =0xa
203+
; CHECK-NEXT: and w8, w0, w8
204+
; CHECK-NEXT: cmp w8, #16
205+
; CHECK-NEXT: orr w8, w8, w9
206+
; CHECK-NEXT: cset w10, lo
207+
; CHECK-NEXT: mul w0, w8, w10
208+
; CHECK-NEXT: ret
209+
%2 = and i32 %0, 21
210+
%3 = icmp ult i32 %2, 16
211+
%4 = add i32 %2, 10
212+
%5 = zext i1 %3 to i32
213+
%6 = mul i32 %4, %5
214+
ret i32 %6
215+
}

llvm/test/CodeGen/AArch64/signed-truncation-check.ll

+10-15
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,8 @@ define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind {
287287
; CHECK-LABEL: add_ultcmp_bad_i16_i8_add:
288288
; CHECK: // %bb.0:
289289
; CHECK-NEXT: add w8, w0, w1
290-
; CHECK-NEXT: and w8, w8, #0xffff
291-
; CHECK-NEXT: cmp w8, #256
292-
; CHECK-NEXT: cset w0, lo
290+
; CHECK-NEXT: tst w8, #0xff00
291+
; CHECK-NEXT: cset w0, eq
293292
; CHECK-NEXT: ret
294293
%tmp0 = add i16 %x, %y
295294
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -328,9 +327,8 @@ define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind {
328327
; CHECK-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
329328
; CHECK: // %bb.0:
330329
; CHECK-NEXT: add w8, w0, #192
331-
; CHECK-NEXT: and w8, w8, #0xffff
332-
; CHECK-NEXT: cmp w8, #256
333-
; CHECK-NEXT: cset w0, lo
330+
; CHECK-NEXT: tst w8, #0xff00
331+
; CHECK-NEXT: cset w0, eq
334332
; CHECK-NEXT: ret
335333
%tmp0 = add i16 %x, 192 ; (1U << (8-1)) + (1U << (8-1-1))
336334
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -356,9 +354,8 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind {
356354
; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic:
357355
; CHECK: // %bb.0:
358356
; CHECK-NEXT: add w8, w0, #64
359-
; CHECK-NEXT: and w8, w8, #0xffff
360-
; CHECK-NEXT: cmp w8, #256
361-
; CHECK-NEXT: cset w0, lo
357+
; CHECK-NEXT: tst w8, #0xff00
358+
; CHECK-NEXT: cset w0, eq
362359
; CHECK-NEXT: ret
363360
%tmp0 = add i16 %x, 64 ; 1U << (8-1-1)
364361
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -370,9 +367,8 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind {
370367
; CHECK-LABEL: add_ultcmp_bad_i16_i4:
371368
; CHECK: // %bb.0:
372369
; CHECK-NEXT: add w8, w0, #8
373-
; CHECK-NEXT: and w8, w8, #0xffff
374-
; CHECK-NEXT: cmp w8, #16
375-
; CHECK-NEXT: cset w0, lo
370+
; CHECK-NEXT: tst w8, #0xfff0
371+
; CHECK-NEXT: cset w0, eq
376372
; CHECK-NEXT: ret
377373
%tmp0 = add i16 %x, 8 ; 1U << (4-1)
378374
%tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4
@@ -384,9 +380,8 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
384380
; CHECK-LABEL: add_ultcmp_bad_i24_i8:
385381
; CHECK: // %bb.0:
386382
; CHECK-NEXT: add w8, w0, #128
387-
; CHECK-NEXT: and w8, w8, #0xffffff
388-
; CHECK-NEXT: cmp w8, #256
389-
; CHECK-NEXT: cset w0, lo
383+
; CHECK-NEXT: tst w8, #0xffff00
384+
; CHECK-NEXT: cset w0, eq
390385
; CHECK-NEXT: ret
391386
%tmp0 = add i24 %x, 128 ; 1U << (8-1)
392387
%tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8

0 commit comments

Comments
 (0)