Skip to content

Commit 32878c2

Browse files
committed
[AArch64] merge index address with large offset into base address
A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE. Fold `mov w8, #56952; movk w8, #15, lsl #16; ldrb w0, [x0, x8]` into `add x0, x0, 1036288; ldrb w0, [x0, 3704]`. Only LDRBBroX is supported for now. Fixes llvm#71917
1 parent 4bad0cb commit 32878c2

File tree

5 files changed

+250
-12
lines changed

5 files changed

+250
-12
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -4089,6 +4089,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
40894089
return MI.getOperand(Idx);
40904090
}
40914091

4092+
/// Returns the operand of a register-offset load/store that callers treat as
/// the shift amount. Only LDRBBroX is handled so far; its operand index 4 is
/// returned. Any other opcode is a programming error and hits
/// llvm_unreachable.
const MachineOperand &
AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::LDRBBroX:
    return MI.getOperand(4);
  default:
    llvm_unreachable("Unexpected opcode");
  }
}
4101+
40924102
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
40934103
Register Reg) {
40944104
if (MI.getParent() == nullptr)

llvm/lib/Target/AArch64/AArch64InstrInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
111111
/// Returns the immediate offset operand of a load/store.
112112
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
113113

114+
/// Returns the shift amount operand of a load/store.
115+
static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
116+
114117
/// Returns whether the instruction is FP or NEON.
115118
static bool isFpOrNEON(const MachineInstr &MI);
116119

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

+229
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
6262
"Number of load/store from unscaled generated");
6363
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
6464
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65+
STATISTIC(NumConstOffsetFolded,
66+
"Number of const offset of index address folded");
6567

6668
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
6769
"Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
7577
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
7678
cl::Hidden);
7779

80+
// The LdStConstLimit limits how far we search for const offset instructions
81+
// when we form index address load/store instructions.
82+
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
83+
cl::init(10), cl::Hidden);
84+
7885
// Enable register renaming to find additional store pairing opportunities.
7986
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
8087
cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
171178
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
172179
int UnscaledOffset, unsigned Limit);
173180

181+
// Scan the instruction list to find a register assigned with a const
182+
// value that can be combined with the current instruction (a load or store)
183+
// using base-plus-immediate addressing. Scan backwards.
184+
MachineBasicBlock::iterator
185+
findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
186+
unsigned &Offset);
187+
174188
// Scan the instruction list to find a base register update that can
175189
// be combined with the current instruction (a load or store) using
176190
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
182196
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
183197
unsigned BaseReg, int Offset);
184198

199+
bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
200+
unsigned IndexReg, unsigned &Offset);
201+
185202
// Merge a pre- or post-index base register update into a ld/st instruction.
186203
MachineBasicBlock::iterator
187204
mergeUpdateInsn(MachineBasicBlock::iterator I,
188205
MachineBasicBlock::iterator Update, bool IsPreIdx);
189206

207+
MachineBasicBlock::iterator
208+
mergeConstOffsetInsn(MachineBasicBlock::iterator I,
209+
MachineBasicBlock::iterator Update, unsigned Offset,
210+
int Scale);
211+
190212
// Find and merge zero store instructions.
191213
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
192214

@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
199221
// Find and merge a base register updates before or after a ld/st instruction.
200222
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
201223

224+
// Find and merge an index ld/st instruction into a base ld/st instruction.
225+
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
226+
202227
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
203228

204229
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
481506
}
482507
}
483508

509+
static unsigned getBaseAddressOpcode(unsigned Opc) {
510+
// TODO: Add more index address loads/stores.
511+
switch (Opc) {
512+
default:
513+
llvm_unreachable("Opcode has no base address equivalent!");
514+
case AArch64::LDRBBroX:
515+
return AArch64::LDRBBui;
516+
}
517+
}
518+
484519
static unsigned getPostIndexedOpcode(unsigned Opc) {
485520
switch (Opc) {
486521
default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
722757
}
723758
}
724759

760+
// Make sure this is a reg+reg Ld/St
761+
/// Returns true if \p MI is a register-offset load/store that the const-offset
/// folding knows how to rewrite, and sets \p Scale to the value used to scale
/// its immediate. \p Scale is left untouched when false is returned.
static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
  switch (MI.getOpcode()) {
  // Register-offset instructions and their scale.
  // TODO: Add more index address loads/stores.
  case AArch64::LDRBBroX:
    Scale = 1;
    return true;
  default:
    return false;
  }
}
773+
725774
static bool isRewritableImplicitDef(unsigned Opc) {
726775
switch (Opc) {
727776
default:
@@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
20182067
return NextI;
20192068
}
20202069

2070+
/// Rewrite a register-offset load/store whose index register holds a constant
/// built by a MOVZWi/MOVKWi pair into an ADDXri plus a base+immediate
/// load/store, then erase the three original instructions.
///
/// \param I      The register-offset load/store to replace.
/// \param Update The MOVKWi that completes the constant. The closest preceding
///               non-debug instruction is erased with it and is expected to be
///               the matching MOVZWi.
/// \param Offset The constant held in the index register.
/// \param Scale  Scale of the immediate field of the replacement load/store;
///               used to split \p Offset into a high and a low part.
/// \returns The first non-debug instruction after \p I, so the caller can
///          continue scanning from there.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                                          MachineBasicBlock::iterator Update,
                                          unsigned Offset, int Scale) {
  assert((Update->getOpcode() == AArch64::MOVKWi) &&
         "Unexpected const mov instruction to merge!");
  MachineBasicBlock &MBB = *I->getParent();
  MachineBasicBlock::iterator NextI = next_nodbg(I, MBB.end());
  // The backward walk must be bounded by the first instruction of the block,
  // not by end(): with end() as the bound the walk has no effective limit.
  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, MBB.begin());
  assert(PrevI->getOpcode() == AArch64::MOVZWi &&
         "Expected a MOVZWi right before the MOVKWi!");
  MachineInstr &MemMI = *I;

  // Split the constant: Low goes into the load/store immediate field, High
  // (a multiple of 4096 * Scale) is added to the base register.
  unsigned Mask = (1 << 12) * Scale - 1;
  unsigned Low = Offset & Mask;
  unsigned High = Offset - Low;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();

  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
  MachineInstrBuilder AddMIB =
      BuildMI(MBB, I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
          .addDef(IndexReg)
          .addUse(BaseReg)
          .addImm(High >> 12) // shifted value
          .addImm(12);        // shift 12
  (void)AddMIB;

  // Ld/St DestReg, IndexReg, Imm12
  // The old index operand (including its flags) is reused as the new base.
  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
  MachineInstrBuilder MemMIB =
      BuildMI(MBB, I, I->getDebugLoc(), TII->get(NewOpc))
          .add(getLdStRegOp(MemMI))
          .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
          .addImm(Low / Scale)
          .setMemRefs(I->memoperands())
          .setMIFlags(I->mergeFlagsWith(*Update));
  (void)MemMIB;

  ++NumConstOffsetFolded;
  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(PrevI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(AddMIB.getInstr()->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MemMIB.getInstr()->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  PrevI->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2126+
20212127
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
20222128
MachineInstr &MI,
20232129
unsigned BaseReg, int Offset) {
@@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
20652171
return false;
20662172
}
20672173

2174+
/// Check whether \p MI is the MOVKWi of a MOVZWi + MOVKWi pair that
/// materializes the constant held in \p IndexReg.
///
/// \param MemMI    The load/store that uses the index register (not inspected
///                 here).
/// \param MI       Candidate instruction; must be a MOVKWi to match.
/// \param IndexReg Index register of the load/store.
/// \param Offset   Set to the combined constant whenever a MOVZWi/MOVKWi pair
///                 writing the same register is found, even if the constant
///                 turns out to be too large and false is returned.
/// \returns true if the pair was found and the constant fits in 24 bits.
bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
                                                 MachineInstr &MI,
                                                 unsigned IndexReg,
                                                 unsigned &Offset) {
  // The update instruction source and destination register must be the
  // same as the load/store index register.
  if (MI.getOpcode() != AArch64::MOVKWi ||
      !TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg()))
    return false;

  // movz + movk hold a large offset of a Ld/St instruction.
  MachineBasicBlock::iterator B = MI.getParent()->begin();
  MachineBasicBlock::iterator MBBI = &MI;
  // Nothing precedes the first instruction of the block; stepping backwards
  // from it would leave the block.
  if (MBBI == B)
    return false;
  MBBI = prev_nodbg(MBBI, B);
  MachineInstr &MovzMI = *MBBI;

  // The MOVZWi must initialize the very register the MOVKWi then updates;
  // otherwise the two immediates do not belong to the same constant.
  if (MovzMI.getOpcode() != AArch64::MOVZWi ||
      MovzMI.getOperand(0).getReg() != MI.getOperand(0).getReg())
    return false;

  // MOVZ writes (imm << shift) and zeroes the remaining bits; MOVK then
  // replaces only the 16-bit field at its own shift. Model exactly that
  // instead of assuming the MOVZ supplies bits [15:0] and the MOVK a
  // disjoint upper halfword.
  unsigned MovzValue = static_cast<unsigned>(MovzMI.getOperand(1).getImm())
                       << MovzMI.getOperand(2).getImm();
  unsigned MovkShift = static_cast<unsigned>(MI.getOperand(3).getImm());
  unsigned MovkValue = static_cast<unsigned>(MI.getOperand(2).getImm())
                       << MovkShift;
  Offset = (MovzValue & ~(0xFFFFu << MovkShift)) | MovkValue;

  // 12-bit optionally shifted immediates are legal for adds, so together
  // with the 12-bit load/store immediate at most 24 bits can be folded.
  return Offset >> 24 == 0;
}
2198+
20682199
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
20692200
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
20702201
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
22202351
return E;
22212352
}
22222353

2354+
/// Scan backwards from the register-offset load/store \p I for the
/// instruction that completes the constant in its index register.
///
/// \param I      The load/store whose index register is examined.
/// \param Limit  Maximum number of non-transient instructions to look at.
/// \param Offset Receives the constant when a match is found.
/// \returns An iterator to the matching instruction, or the block's end()
///          iterator if there is none.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
  MachineBasicBlock &MBB = *I->getParent();
  const MachineBasicBlock::iterator Begin = MBB.begin();
  const MachineBasicBlock::iterator NoMatch = MBB.end();
  MachineInstr &MemMI = *I;

  // If the load/store is the first instruction in the block, nothing can
  // precede it, so there is no constant to fold.
  if (I == Begin)
    return NoMatch;

  // Make sure the IndexReg is killed and the shift amount is zero.
  // TODO: Relax this restriction to support extends; kept simple for now.
  const MachineOperand &IndexOp = AArch64InstrInfo::getLdStOffsetOp(MemMI);
  if (!IndexOp.isKill())
    return NoMatch;
  const MachineOperand &AmountOp = AArch64InstrInfo::getLdStAmountOp(MemMI);
  if (!AmountOp.isImm() || AmountOp.getImm() != 0)
    return NoMatch;

  Register IndexReg = IndexOp.getReg();

  // Track which register units have been modified and used between the
  // candidate instruction and the load/store.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned NumScanned = 0;
  MachineBasicBlock::iterator Cur = I;
  for (;;) {
    Cur = prev_nodbg(Cur, Begin);
    MachineInstr &Candidate = *Cur;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!Candidate.isTransient())
      ++NumScanned;

    // A match ends the search immediately.
    if (isMatchingMovConstInsn(MemMI, Candidate, IndexReg, Offset))
      return Cur;

    // Record what this non-matching instruction clobbers and reads.
    LiveRegUnits::accumulateUsedDefed(Candidate, ModifiedRegUnits,
                                      UsedRegUnits, TRI);

    // Any other use or definition of the index register on the way back
    // means the constant cannot be folded.
    if (!ModifiedRegUnits.available(IndexReg) ||
        !UsedRegUnits.available(IndexReg))
      return NoMatch;

    if (Cur == Begin || NumScanned >= Limit)
      break;
  }
  return NoMatch;
}
2407+
22232408
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
22242409
MachineBasicBlock::iterator &MBBI) {
22252410
MachineInstr &MI = *MBBI;
@@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
24042589
return false;
24052590
}
24062591

2592+
/// Try to fold a constant index register into the register-offset load/store
/// at \p MBBI. For example,
///   mov x8, #LargeImm   ; = a * (1<<12) + imm12
///   ldr x1, [x0, x8]
/// is merged into:
///   add x8, x0, a * (1<<12)
///   ldr x1, [x8, imm12]
///
/// \param MBBI  The load/store to rewrite; on success it is updated to the
///              iterator returned by mergeConstOffsetInsn.
/// \param Scale Scale of the replacement instruction's immediate; the
///              constant must have no bits set below it.
/// \returns true if the instruction was rewritten.
bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
                                              int Scale) {
  MachineInstr &MemMI = *MBBI;

  // Unscaled-offset opcodes are not handled here, so bail.
  if (TII->hasUnscaledLdStOffset(MemMI.getOpcode()))
    return false;

  // Look back for the instructions that build the constant index.
  const MachineBasicBlock::iterator End = MemMI.getParent()->end();
  unsigned Offset;
  MachineBasicBlock::iterator MovConst =
      findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
  if (MovConst == End)
    return false;

  // The low part must be expressible in units of Scale.
  if ((Offset & (Scale - 1)) != 0)
    return false;

  // Merge the imm12 into the ld/st.
  MBBI = mergeConstOffsetInsn(MBBI, MovConst, Offset, Scale);
  return true;
}
2619+
24072620
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
24082621
bool EnableNarrowZeroStOpt) {
24092622

@@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
24822695
++MBBI;
24832696
}
24842697

2698+
// 5) Find a register assigned with a const value that can be combined
2699+
// into the load or store. e.g.,
2700+
// mov x8, #LargeImm ; = a * (1<<12) + imm12
2701+
// ldr x1, [x0, x8]
2702+
// ; becomes
2703+
// add x8, x0, a * (1<<12)
2704+
// ldr x1, [x8, imm12]
2705+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2706+
MBBI != E;) {
2707+
int Scale;
2708+
if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
2709+
Modified = true;
2710+
else
2711+
++MBBI;
2712+
}
2713+
24852714
return Modified;
24862715
}
24872716

llvm/test/CodeGen/AArch64/arm64-addrmode.ll

+6-9
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,8 @@ define void @t17(i64 %a) {
214214
define i8 @LdOffset_i8(ptr %a) {
215215
; CHECK-LABEL: LdOffset_i8:
216216
; CHECK: // %bb.0:
217-
; CHECK-NEXT: mov w8, #56952 // =0xde78
218-
; CHECK-NEXT: movk w8, #15, lsl #16
219-
; CHECK-NEXT: ldrb w0, [x0, x8]
217+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
218+
; CHECK-NEXT: ldrb w0, [x8, #3704]
220219
; CHECK-NEXT: ret
221220
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
222221
%val = load i8, ptr %arrayidx, align 1
@@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) {
227226
define i32 @LdOffset_i8_zext32(ptr %a) {
228227
; CHECK-LABEL: LdOffset_i8_zext32:
229228
; CHECK: // %bb.0:
230-
; CHECK-NEXT: mov w8, #56952 // =0xde78
231-
; CHECK-NEXT: movk w8, #15, lsl #16
232-
; CHECK-NEXT: ldrb w0, [x0, x8]
229+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
230+
; CHECK-NEXT: ldrb w0, [x8, #3704]
233231
; CHECK-NEXT: ret
234232
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
235233
%val = load i8, ptr %arrayidx, align 1
@@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) {
255253
define i64 @LdOffset_i8_zext64(ptr %a) {
256254
; CHECK-LABEL: LdOffset_i8_zext64:
257255
; CHECK: // %bb.0:
258-
; CHECK-NEXT: mov w8, #56952 // =0xde78
259-
; CHECK-NEXT: movk w8, #15, lsl #16
260-
; CHECK-NEXT: ldrb w0, [x0, x8]
256+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
257+
; CHECK-NEXT: ldrb w0, [x8, #3704]
261258
; CHECK-NEXT: ret
262259
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
263260
%val = load i8, ptr %arrayidx, align 1

llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir

+2-3
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,8 @@ body: |
1414
; CHECK-LABEL: name: LdOffset
1515
; CHECK: liveins: $x0
1616
; CHECK-NEXT: {{ $}}
17-
; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
18-
; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
19-
; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
17+
; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
18+
; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
2019
; CHECK-NEXT: RET undef $lr, implicit $w0
2120
renamable $w8 = MOVZWi 56952, 0
2221
renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8

0 commit comments

Comments
 (0)