
Commit f62d8a1

[AArch64] Compare BFI and ORR with left-shifted operand for OR instruction selection.
Before this patch:
- For `r = or op0, op1`, `tryBitfieldInsertOpFromOr` combines it to BFI when
  1) one of the two operands is a bit-field-positioning or bit-field-extraction op, and
  2) the bits from the two operands don't overlap.

After this patch:
- Right before the OR is combined to BFI, evaluate whether an ORR with a left-shifted operand is better.

A motivating example (https://godbolt.org/z/rnMrzs5vn, which is added as a test case `test_orr_not_bfi` in `CodeGen/AArch64/bitfield-insert.ll`):

For the IR:
```
define i64 @test_orr_not_bfxil(i64 %0) {
  %2 = and i64 %0, 127
  %3 = lshr i64 %0, 1
  %4 = and i64 %3, 16256
  %5 = or i64 %4, %2
  ret i64 %5
}
```

Before:
```
lsr   x8, x0, #1
and   x8, x8, #0x3f80
bfxil x8, x0, #0, #7
```

After:
```
ubfx x8, x0, #8, #7
and  x9, x0, #0x7f
orr  x0, x9, x8, lsl #7
```

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D135102
Parent: 8086b0c

18 files changed (+241, -121 lines)
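Before the per-file diffs, here is a quick sanity check of the motivating example above. The sketch is not part of the patch: it emulates the "Before" (BFXIL-based) and "After" (ORR-with-shifted-operand) sequences in plain C++ and asserts that they compute the same value; the function names and sample inputs are illustrative only.

```cpp
// Standalone sanity check (not part of the patch): emulate the "Before"
// (BFXIL-based) and "After" (shifted-ORR) sequences from the commit message
// for test_orr_not_bfxil and assert they agree.
#include <cassert>
#include <cstdint>

// Before: lsr x8, x0, #1 ; and x8, x8, #0x3f80 ; bfxil x8, x0, #0, #7
static uint64_t beforeSeq(uint64_t X) {
  uint64_t R = (X >> 1) & 0x3f80;  // lsr + and
  R = (R & ~0x7fULL) | (X & 0x7f); // bfxil: insert bits [6:0] of X into R
  return R;
}

// After: ubfx x8, x0, #8, #7 ; and x9, x0, #0x7f ; orr x0, x9, x8, lsl #7
static uint64_t afterSeq(uint64_t X) {
  uint64_t Hi = (X >> 8) & 0x7f; // ubfx: extract bits [14:8]
  uint64_t Lo = X & 0x7f;        // and: keep bits [6:0]
  return Lo | (Hi << 7);         // orr with left-shifted operand
}

int main() {
  for (uint64_t X : {0x0ULL, 0x7fULL, 0x3fffULL, 0x123456789abcdef0ULL})
    assert(beforeSeq(X) == afterSeq(X));
  return 0;
}
```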

Diff for: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+124)

```diff
@@ -2803,6 +2803,122 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
   return true;
 }
 
+static bool isWorthFoldingIntoOrrWithLeftShift(SDValue Dst,
+                                               SelectionDAG *CurDAG,
+                                               SDValue &LeftShiftedOperand,
+                                               uint64_t &LeftShiftAmount) {
+  // Avoid folding Dst into ORR-with-left-shift if Dst has other uses than ORR.
+  if (!Dst.hasOneUse())
+    return false;
+
+  EVT VT = Dst.getValueType();
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Caller should guarantee that VT is one of i32 or i64");
+  const unsigned SizeInBits = VT.getSizeInBits();
+
+  SDLoc DL(Dst.getNode());
+  uint64_t AndImm, ShlImm;
+  if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
+      isShiftedMask_64(AndImm)) {
+    // Avoid transforming 'DstOp0' if it has other uses than the AND node.
+    SDValue DstOp0 = Dst.getOperand(0);
+    if (!DstOp0.hasOneUse())
+      return false;
+
+    // An example to illustrate the transformation
+    // From:
+    //   lsr x8, x1, #1
+    //   and x8, x8, #0x3f80
+    //   bfxil x8, x1, #0, #7
+    // To:
+    //   and x8, x23, #0x7f
+    //   ubfx x9, x23, #8, #7
+    //   orr x23, x8, x9, lsl #7
+    //
+    // The number of instructions remains the same, but ORR is faster than BFXIL
+    // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
+    // the dependency chain is improved after the transformation.
+    uint64_t SrlImm;
+    if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
+      uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
+      if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
+        unsigned MaskWidth =
+            countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
+        unsigned UBFMOpc =
+            (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+        SDNode *UBFMNode = CurDAG->getMachineNode(
+            UBFMOpc, DL, VT, DstOp0.getOperand(0),
+            CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
+                                      VT),
+            CurDAG->getTargetConstant(
+                SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
+        LeftShiftedOperand = SDValue(UBFMNode, 0);
+        LeftShiftAmount = NumTrailingZeroInShiftedMask;
+        return true;
+      }
+    }
+  } else if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
+    LeftShiftedOperand = Dst.getOperand(0);
+    LeftShiftAmount = ShlImm;
+    return true;
+  }
+  // FIXME: Extend the implementation to optimize if Dst is an SRL node.
+  return false;
+}
+
+static bool tryOrrWithLeftShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
+                                SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
+                                const bool BiggerPattern) {
+  EVT VT = N->getValueType(0);
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Expect result type to be i32 or i64 since N is combinable to BFM");
+  SDLoc DL(N);
+
+  // Bail out if BFM simplifies away one node in BFM Dst.
+  if (OrOpd1 != Dst)
+    return false;
+
+  // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
+  // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
+  if (BiggerPattern) {
+    uint64_t SrcAndImm;
+    if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
+        isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
+      // OrOpd0 = AND Src, #Mask
+      // So BFM simplifies away one AND node from Src and doesn't simplify away
+      // nodes from Dst. If ORR with left-shifted operand also simplifies away
+      // one node (from Rd), ORR is better since it has higher throughput and
+      // smaller latency than BFM on many AArch64 processors (and for the rest
+      // ORR is at least as good as BFM).
+      SDValue LeftShiftedOperand;
+      uint64_t LeftShiftAmount;
+      if (isWorthFoldingIntoOrrWithLeftShift(Dst, CurDAG, LeftShiftedOperand,
+                                             LeftShiftAmount)) {
+        unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+        SDValue Ops[] = {OrOpd0, LeftShiftedOperand,
+                         CurDAG->getTargetConstant(LeftShiftAmount, DL, VT)};
+        CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  assert((!BiggerPattern) && "BiggerPattern should be handled above");
+
+  uint64_t ShlImm;
+  // FIXME: Extend the implementation if OrOpd0 is an SRL node.
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
+      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
+    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+    return true;
+  }
+
+  return false;
+}
+
 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
                                       SelectionDAG *CurDAG) {
   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2905,6 +3021,14 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
     // or is useful because it discards more bits
     Dst = OrOpd1Val;
 
+    // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
+    // with left-shifted operand is more efficient.
+    // FIXME: Extend this to compare AArch64::BFM and AArch64::ORR with
+    //        right-shifted operand as well.
+    if (tryOrrWithLeftShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
+                            BiggerPattern))
+      return true;
+
     // both parts match
     SDLoc DL(N);
     SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
```
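The core arithmetic in the new `isWorthFoldingIntoOrrWithLeftShift` is how the UBFM (UBFX) immediates are derived when `Dst` is `(and (srl X, SrlImm), AndImm)` with a shifted-mask `AndImm`. The sketch below restates that computation outside of SelectionDAG; `UbfxPlan` and `planUbfxForShiftedMask` are illustrative names rather than LLVM APIs, and it assumes a C++20 compiler for `<bit>`.

```cpp
// Standalone sketch (not LLVM code): given Dst = (and (srl X, SrlImm), AndImm)
// where AndImm is a shifted mask, compute the UBFX that extracts the field at
// its original position in X and the left-shift the final ORR must apply.
// This mirrors the immediate computation in isWorthFoldingIntoOrrWithLeftShift.
#include <bit> // std::countr_zero / std::countr_one (C++20)
#include <cassert>
#include <cstdint>

struct UbfxPlan {
  unsigned Lsb;      // lowest bit extracted from the source value
  unsigned Width;    // number of contiguous bits extracted
  unsigned OrrShift; // left-shift amount used by the final ORR
};

static UbfxPlan planUbfxForShiftedMask(uint64_t AndImm, uint64_t SrlImm) {
  unsigned TrailingZeros = std::countr_zero(AndImm);
  unsigned Width = std::countr_one(AndImm >> TrailingZeros);
  // The patch emits UBFM with immr = SrlImm + TrailingZeros and
  // imms = immr + Width - 1, i.e. "ubfx #<immr>, #<Width>", and then reuses
  // TrailingZeros as the shift amount of the final shifted ORR.
  return {static_cast<unsigned>(SrlImm) + TrailingZeros, Width, TrailingZeros};
}

int main() {
  // Motivating example: (x >> 1) & 0x3f80 becomes "ubfx #8, #7" plus "lsl #7".
  UbfxPlan P = planUbfxForShiftedMask(/*AndImm=*/0x3f80, /*SrlImm=*/1);
  assert(P.Lsb == 8 && P.Width == 7 && P.OrrShift == 7);
  return 0;
}
```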

Diff for: llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll (+5, -5)

```diff
@@ -964,9 +964,9 @@ entry:
 define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
 ; LLC-LABEL: test_ignored_rightbits:
 ; LLC: // %bb.0:
-; LLC-NEXT: and w0, w0, #0x7
-; LLC-NEXT: bfi w0, w1, #3, #4
-; LLC-NEXT: bfi w0, w0, #8, #7
+; LLC-NEXT: and w8, w0, #0x7
+; LLC-NEXT: bfi w8, w1, #3, #4
+; LLC-NEXT: orr w0, w8, w8, lsl #8
 ; LLC-NEXT: ret
 ; OPT-LABEL: @test_ignored_rightbits(
 ; OPT-NEXT: [[POSITIONED_FIELD:%.*]] = shl i32 [[IN:%.*]], 3
@@ -1000,8 +1000,8 @@ define void @sameOperandBFI(i64 %src, i64 %src2, i16 *%ptr) {
 ; LLC-NEXT: lsr x8, x0, #47
 ; LLC-NEXT: and w9, w1, #0x3
 ; LLC-NEXT: bfi w9, w8, #2, #2
-; LLC-NEXT: bfi w9, w9, #4, #4
-; LLC-NEXT: strh w9, [x2]
+; LLC-NEXT: orr w8, w9, w9, lsl #4
+; LLC-NEXT: strh w8, [x2]
 ; LLC-NEXT: .LBB30_2: // %end
 ; LLC-NEXT: ret
 ; OPT-LABEL: @sameOperandBFI(
```

Diff for: llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll (+15, -15)

```diff
@@ -5,8 +5,8 @@ define i24 @ldi24(ptr %p) nounwind {
 ; CHECK-LABEL: ldi24:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w0, [x0]
-; CHECK-NEXT: bfi w0, w8, #16, #16
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w0, w9, w8, lsl #16
 ; CHECK-NEXT: ret
   %r = load i24, i24* %p
   ret i24 %r
@@ -17,9 +17,9 @@ define i56 @ldi56(ptr %p) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrb w8, [x0, #6]
 ; CHECK-NEXT: ldrh w9, [x0, #4]
-; CHECK-NEXT: ldr w0, [x0]
-; CHECK-NEXT: bfi w9, w8, #16, #16
-; CHECK-NEXT: bfi x0, x9, #32, #32
+; CHECK-NEXT: ldr w10, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr x0, x10, x8, lsl #32
 ; CHECK-NEXT: ret
   %r = load i56, i56* %p
   ret i56 %r
@@ -41,10 +41,10 @@ define i120 @ldi120(ptr %p) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrb w8, [x0, #14]
 ; CHECK-NEXT: ldrh w9, [x0, #12]
-; CHECK-NEXT: ldr w1, [x0, #8]
+; CHECK-NEXT: ldr w10, [x0, #8]
 ; CHECK-NEXT: ldr x0, [x0]
-; CHECK-NEXT: bfi w9, w8, #16, #16
-; CHECK-NEXT: bfi x1, x9, #32, #32
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr x1, x10, x8, lsl #32
 ; CHECK-NEXT: ret
   %r = load i120, i120* %p
   ret i120 %r
@@ -55,10 +55,10 @@ define i280 @ldi280(ptr %p) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldp x8, x1, [x0]
 ; CHECK-NEXT: ldrb w9, [x0, #34]
-; CHECK-NEXT: ldrh w4, [x0, #32]
+; CHECK-NEXT: ldrh w10, [x0, #32]
 ; CHECK-NEXT: ldp x2, x3, [x0, #16]
 ; CHECK-NEXT: mov x0, x8
-; CHECK-NEXT: bfi x4, x9, #16, #8
+; CHECK-NEXT: orr x4, x10, x9, lsl #16
 ; CHECK-NEXT: ret
   %r = load i280, i280* %p
   ret i280 %r
@@ -133,7 +133,7 @@ define void @i56_or(ptr %a) {
 ; CHECK-NEXT: ldrh w10, [x8, #4]!
 ; CHECK-NEXT: ldrb w11, [x8, #2]
 ; CHECK-NEXT: orr w9, w9, #0x180
-; CHECK-NEXT: bfi w10, w11, #16, #16
+; CHECK-NEXT: orr w10, w10, w11, lsl #16
 ; CHECK-NEXT: str w9, [x0]
 ; CHECK-NEXT: strb w11, [x8, #2]
 ; CHECK-NEXT: strh w10, [x8]
@@ -153,7 +153,7 @@ define void @i56_and_or(ptr %a) {
 ; CHECK-NEXT: ldrb w11, [x8, #2]
 ; CHECK-NEXT: orr w9, w9, #0x180
 ; CHECK-NEXT: and w9, w9, #0xffffff80
-; CHECK-NEXT: bfi w10, w11, #16, #16
+; CHECK-NEXT: orr w10, w10, w11, lsl #16
 ; CHECK-NEXT: strb w11, [x8, #2]
 ; CHECK-NEXT: str w9, [x0]
 ; CHECK-NEXT: strh w10, [x8]
@@ -172,11 +172,11 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; CHECK-NEXT: ldr w11, [x0]
 ; CHECK-NEXT: ldrh w9, [x8, #4]!
 ; CHECK-NEXT: ldrb w10, [x8, #2]
-; CHECK-NEXT: bfi w9, w10, #16, #8
+; CHECK-NEXT: orr w9, w9, w10, lsl #16
 ; CHECK-NEXT: strb w10, [x8, #2]
-; CHECK-NEXT: bfi x11, x9, #32, #24
-; CHECK-NEXT: strh w9, [x8]
+; CHECK-NEXT: orr x11, x11, x9, lsl #32
 ; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff
+; CHECK-NEXT: strh w9, [x8]
 ; CHECK-NEXT: orr w11, w11, w1, lsl #13
 ; CHECK-NEXT: str w11, [x0]
 ; CHECK-NEXT: ret
```

Diff for: llvm/test/CodeGen/AArch64/arm64-strict-align.ll (+2, -2)

```diff
@@ -5,7 +5,7 @@
 define i32 @f0(i32* nocapture %p) nounwind {
 ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
 ; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
-; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
+; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
 ; CHECK-STRICT: ret

 ; CHECK: ldr w0, [x0]
@@ -16,7 +16,7 @@ define i32 @f0(i32* nocapture %p) nounwind {

 define i64 @f1(i64* nocapture %p) nounwind {
 ; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
-; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
+; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
 ; CHECK-STRICT: ret

 ; CHECK: ldr x0, [x0]
```

Diff for: llvm/test/CodeGen/AArch64/arm64_32.ll (+3, -2)

```diff
@@ -662,8 +662,9 @@ define void @test_struct_hi(i32 %hi) nounwind {
 ; CHECK-LABEL: test_struct_hi:
 ; CHECK: mov w[[IN:[0-9]+]], w0
 ; CHECK: bl _get_int
-; CHECK-FAST-NEXT: mov w0, w0
-; CHECK-NEXT: bfi x0, x[[IN]], #32, #32
+; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
+; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
+; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
 ; CHECK-NEXT: bl _take_pair
   %val.64 = call i64 @get_int()
   %val.32 = trunc i64 %val.64 to i32
```

Diff for: llvm/test/CodeGen/AArch64/bfis-in-loop.ll (+4, -4)

```diff
@@ -28,8 +28,8 @@ define i64 @bfis_in_loop_zero() {
 ; CHECK-NEXT: ldr x11, [x9, #8]
 ; CHECK-NEXT: and x9, x10, #0xff
 ; CHECK-NEXT: and x10, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi x9, x8, #8, #32
-; CHECK-NEXT: bfi x10, x12, #16, #1
+; CHECK-NEXT: orr x9, x9, x8, lsl #8
+; CHECK-NEXT: orr x10, x10, x12, lsl #16
 ; CHECK-NEXT: orr x0, x10, x9
 ; CHECK-NEXT: ldr x9, [x11, #16]
 ; CHECK-NEXT: cbnz x11, .LBB0_1
@@ -97,8 +97,8 @@ define i64 @bfis_in_loop_undef() {
 ; CHECK-NEXT: ldr x11, [x9, #8]
 ; CHECK-NEXT: and x9, x10, #0xff
 ; CHECK-NEXT: and x10, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi x9, x8, #8, #32
-; CHECK-NEXT: bfi x10, x12, #16, #1
+; CHECK-NEXT: orr x9, x9, x8, lsl #8
+; CHECK-NEXT: orr x10, x10, x12, lsl #16
 ; CHECK-NEXT: orr x0, x10, x9
 ; CHECK-NEXT: ldr x9, [x11, #16]
 ; CHECK-NEXT: cbnz x11, .LBB1_1
```

Diff for: llvm/test/CodeGen/AArch64/bitfield-insert.ll (+7, -10)

```diff
@@ -269,8 +269,7 @@ define i32 @test_nouseful_bits(i8 %a, i32 %b) {
 ; CHECK-NEXT: lsl w8, w8, #8
 ; CHECK-NEXT: mov w9, w8
 ; CHECK-NEXT: bfxil w9, w0, #0, #8
-; CHECK-NEXT: bfi w8, w9, #16, #16
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
 ; CHECK-NEXT: ret
   %conv = zext i8 %a to i32 ; 0 0 0 A
   %shl = shl i32 %b, 8 ; B2 B1 B0 0
@@ -612,10 +611,9 @@ define i64 @test_and_extended_shift_with_imm(i64 %0) {
 define i64 @test_orr_not_bfxil_i64(i64 %0) {
 ; CHECK-LABEL: test_orr_not_bfxil_i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #1
-; CHECK-NEXT: and x8, x8, #0x3f80
-; CHECK-NEXT: bfxil x8, x0, #0, #7
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ubfx x8, x0, #8, #7
+; CHECK-NEXT: and x9, x0, #0x7f
+; CHECK-NEXT: orr x0, x9, x8, lsl #7
 ; CHECK-NEXT: ret
   %2 = and i64 %0, 127
   %3 = lshr i64 %0, 1
@@ -628,10 +626,9 @@ define i64 @test_orr_not_bfxil_i64(i64 %0) {
 define i32 @test_orr_not_bfxil_i32(i32 %0) {
 ; CHECK-LABEL: test_orr_not_bfxil_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #1
-; CHECK-NEXT: and w8, w8, #0x3f80
-; CHECK-NEXT: bfxil w8, w0, #0, #7
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: ubfx w8, w0, #8, #7
+; CHECK-NEXT: and w9, w0, #0x7f
+; CHECK-NEXT: orr w0, w9, w8, lsl #7
 ; CHECK-NEXT: ret
   %2 = and i32 %0, 127
   %3 = lshr i32 %0, 1
```

Diff for: llvm/test/CodeGen/AArch64/build-pair-isel.ll (+1, -3)

```diff
@@ -14,13 +14,11 @@ define void @compare_and_swap128() {
 ; CHECK-NEXT: mov w9, w10
 ; CHECK-NEXT: mov w8, w8
 ; CHECK-NEXT: // kill: def $x8 killed $w8
-; CHECK-NEXT: bfi x8, x9, #32, #32
+; CHECK-NEXT: orr x8, x8, x9, lsl #32
 ; CHECK-NEXT: // implicit-def: $x9
 ; CHECK-NEXT: str x8, [x9]
 ; CHECK-NEXT: ret
   %1 = call i128 asm sideeffect "nop", "=r,~{memory}"()
   store i128 %1, i128* undef, align 16
   ret void
 }
-
-
```

Diff for: llvm/test/CodeGen/AArch64/funnel-shift-rot.ll (+1, -2)

```diff
@@ -19,8 +19,7 @@ define i8 @rotl_i8_const_shift(i8 %x) {
 ; CHECK-LABEL: rotl_i8_const_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ubfx w8, w0, #5, #3
-; CHECK-NEXT: bfi w8, w0, #3, #29
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: orr w0, w8, w0, lsl #3
 ; CHECK-NEXT: ret
   %f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
```
