@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
62
62
" Number of load/store from unscaled generated" );
63
63
STATISTIC (NumZeroStoresPromoted, " Number of narrow zero stores promoted" );
64
64
STATISTIC (NumLoadsFromStoresPromoted, " Number of loads from stores promoted" );
65
+ STATISTIC (NumConstOffsetFolded,
66
+ " Number of const offset of index address folded" );
65
67
66
68
DEBUG_COUNTER (RegRenamingCounter, DEBUG_TYPE " -reg-renaming" ,
67
69
" Controls which pairs are considered for renaming" );
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
75
77
static cl::opt<unsigned > UpdateLimit (" aarch64-update-scan-limit" , cl::init(100 ),
76
78
cl::Hidden);
77
79
80
+ // The LdStConstLimit limits how far we search for const offset instructions
81
+ // when we form index address load/store instructions.
82
+ static cl::opt<unsigned > LdStConstLimit (" aarch64-load-store-const-scan-limit" ,
83
+ cl::init (10 ), cl::Hidden);
84
+
78
85
// Enable register renaming to find additional store pairing opportunities.
79
86
static cl::opt<bool > EnableRenaming (" aarch64-load-store-renaming" ,
80
87
cl::init (true ), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
171
178
findMatchingUpdateInsnForward (MachineBasicBlock::iterator I,
172
179
int UnscaledOffset, unsigned Limit);
173
180
181
+ // Scan the instruction list to find a register assigned with a const
+ // value that can be combined with the current instruction (a load or store)
+ // using base addressing with writeback. Scan backwards.
184
+ MachineBasicBlock::iterator
185
+ findMatchingConstOffsetBackward (MachineBasicBlock::iterator I, unsigned Limit,
186
+ unsigned &Offset);
187
+
174
188
// Scan the instruction list to find a base register update that can
175
189
// be combined with the current instruction (a load or store) using
176
190
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
182
196
bool isMatchingUpdateInsn (MachineInstr &MemMI, MachineInstr &MI,
183
197
unsigned BaseReg, int Offset);
184
198
199
+ bool isMatchingMovConstInsn (MachineInstr &MemMI, MachineInstr &MI,
200
+ unsigned IndexReg, unsigned &Offset);
201
+
185
202
// Merge a pre- or post-index base register update into a ld/st instruction.
186
203
MachineBasicBlock::iterator
187
204
mergeUpdateInsn (MachineBasicBlock::iterator I,
188
205
MachineBasicBlock::iterator Update, bool IsPreIdx);
189
206
207
+ MachineBasicBlock::iterator
208
+ mergeConstOffsetInsn (MachineBasicBlock::iterator I,
209
+ MachineBasicBlock::iterator Update, unsigned Offset,
210
+ int Scale);
211
+
190
212
// Find and merge zero store instructions.
191
213
bool tryToMergeZeroStInst (MachineBasicBlock::iterator &MBBI);
192
214
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
199
221
// Find and merge a base register updates before or after a ld/st instruction.
200
222
bool tryToMergeLdStUpdate (MachineBasicBlock::iterator &MBBI);
201
223
224
+ // Find and merge index ldr/st instructions into a base ld/st instruction.
225
+ bool tryToMergeIndexLdSt (MachineBasicBlock::iterator &MBBI, int Scale);
226
+
202
227
bool optimizeBlock (MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
203
228
204
229
bool runOnMachineFunction (MachineFunction &Fn) override ;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
481
506
}
482
507
}
483
508
509
+ static unsigned getBaseAddressOpcode (unsigned Opc) {
510
+ // TODO: Add more index address loads/stores.
511
+ switch (Opc) {
512
+ default :
513
+ llvm_unreachable (" Opcode has no base address equivalent!" );
514
+ case AArch64::LDRBBroX:
515
+ return AArch64::LDRBBui;
516
+ }
517
+ }
518
+
484
519
static unsigned getPostIndexedOpcode (unsigned Opc) {
485
520
switch (Opc) {
486
521
default :
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
722
757
}
723
758
}
724
759
760
+ // Make sure this is a reg+reg Ld/St
761
+ static bool isMergeableIndexLdSt (MachineInstr &MI, int &Scale) {
762
+ unsigned Opc = MI.getOpcode ();
763
+ switch (Opc) {
764
+ default :
765
+ return false ;
766
+ // Scaled instructions.
767
+ // TODO: Add more index address loads/stores.
768
+ case AArch64::LDRBBroX:
769
+ Scale = 1 ;
770
+ return true ;
771
+ }
772
+ }
773
+
725
774
static bool isRewritableImplicitDef (unsigned Opc) {
726
775
switch (Opc) {
727
776
default :
@@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
2018
2067
return NextI;
2019
2068
}
2020
2069
2070
+ MachineBasicBlock::iterator
2071
+ AArch64LoadStoreOpt::mergeConstOffsetInsn (MachineBasicBlock::iterator I,
2072
+ MachineBasicBlock::iterator Update,
2073
+ unsigned Offset, int Scale) {
2074
+ assert ((Update->getOpcode () == AArch64::MOVKWi) &&
2075
+ " Unexpected const mov instruction to merge!" );
2076
+ MachineBasicBlock::iterator E = I->getParent ()->end ();
2077
+ MachineBasicBlock::iterator NextI = next_nodbg (I, E);
2078
+ MachineBasicBlock::iterator PrevI = prev_nodbg (Update, E);
2079
+ MachineInstr &MemMI = *I;
2080
+ unsigned Mask = (1 << 12 ) * Scale - 1 ;
2081
+ unsigned Low = Offset & Mask;
2082
+ unsigned High = Offset - Low;
2083
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp (MemMI).getReg ();
2084
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp (MemMI).getReg ();
2085
+ MachineInstrBuilder AddMIB, MemMIB;
2086
+
2087
+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2088
+ AddMIB =
2089
+ BuildMI (*I->getParent (), I, I->getDebugLoc (), TII->get (AArch64::ADDXri))
2090
+ .addDef (IndexReg)
2091
+ .addUse (BaseReg)
2092
+ .addImm (High >> 12 ) // shifted value
2093
+ .addImm (12 ); // shift 12
2094
+ (void )AddMIB;
2095
+ // Ld/St DestReg, IndexReg, Imm12
2096
+ unsigned NewOpc = getBaseAddressOpcode (I->getOpcode ());
2097
+ MemMIB = BuildMI (*I->getParent (), I, I->getDebugLoc (), TII->get (NewOpc))
2098
+ .add (getLdStRegOp (MemMI))
2099
+ .add (AArch64InstrInfo::getLdStOffsetOp (MemMI))
2100
+ .addImm (Low / Scale)
2101
+ .setMemRefs (I->memoperands ())
2102
+ .setMIFlags (I->mergeFlagsWith (*Update));
2103
+ (void )MemMIB;
2104
+
2105
+ ++NumConstOffsetFolded;
2106
+ LLVM_DEBUG (dbgs () << " Creating base address load/store.\n " );
2107
+ LLVM_DEBUG (dbgs () << " Replacing instructions:\n " );
2108
+ LLVM_DEBUG (PrevI->print (dbgs ()));
2109
+ LLVM_DEBUG (dbgs () << " " );
2110
+ LLVM_DEBUG (Update->print (dbgs ()));
2111
+ LLVM_DEBUG (dbgs () << " " );
2112
+ LLVM_DEBUG (I->print (dbgs ()));
2113
+ LLVM_DEBUG (dbgs () << " with instruction:\n " );
2114
+ LLVM_DEBUG (((MachineInstr *)AddMIB)->print (dbgs ()));
2115
+ LLVM_DEBUG (dbgs () << " " );
2116
+ LLVM_DEBUG (((MachineInstr *)MemMIB)->print (dbgs ()));
2117
+ LLVM_DEBUG (dbgs () << " \n " );
2118
+
2119
+ // Erase the old instructions for the block.
2120
+ I->eraseFromParent ();
2121
+ PrevI->eraseFromParent ();
2122
+ Update->eraseFromParent ();
2123
+
2124
+ return NextI;
2125
+ }
2126
+
2021
2127
bool AArch64LoadStoreOpt::isMatchingUpdateInsn (MachineInstr &MemMI,
2022
2128
MachineInstr &MI,
2023
2129
unsigned BaseReg, int Offset) {
@@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2065
2171
return false ;
2066
2172
}
2067
2173
2174
+ bool AArch64LoadStoreOpt::isMatchingMovConstInsn (MachineInstr &MemMI,
2175
+ MachineInstr &MI,
2176
+ unsigned IndexReg,
2177
+ unsigned &Offset) {
2178
+ // The update instruction source and destination register must be the
2179
+ // same as the load/store index register.
2180
+ if (MI.getOpcode () == AArch64::MOVKWi &&
2181
+ TRI->isSuperOrSubRegisterEq (IndexReg, MI.getOperand (1 ).getReg ())) {
2182
+
2183
+ // movz + movk hold a large offset of a Ld/St instruction.
2184
+ MachineBasicBlock::iterator B = MI.getParent ()->begin ();
2185
+ MachineBasicBlock::iterator MBBI = &MI;
2186
+ MBBI = prev_nodbg (MBBI, B);
2187
+ MachineInstr &MovzMI = *MBBI;
2188
+ if (MovzMI.getOpcode () == AArch64::MOVZWi) {
2189
+ unsigned Low = MovzMI.getOperand (1 ).getImm ();
2190
+ unsigned High = MI.getOperand (2 ).getImm () << MI.getOperand (3 ).getImm ();
2191
+ Offset = High + Low;
2192
+ // 12-bit optionally shifted immediates are legal for adds.
2193
+ return Offset >> 24 == 0 ;
2194
+ }
2195
+ }
2196
+ return false ;
2197
+ }
2198
+
2068
2199
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward (
2069
2200
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2070
2201
MachineBasicBlock::iterator E = I->getParent ()->end ();
@@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2220
2351
return E;
2221
2352
}
2222
2353
2354
+ MachineBasicBlock::iterator
2355
+ AArch64LoadStoreOpt::findMatchingConstOffsetBackward (
2356
+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2357
+ MachineBasicBlock::iterator B = I->getParent ()->begin ();
2358
+ MachineBasicBlock::iterator E = I->getParent ()->end ();
2359
+ MachineInstr &MemMI = *I;
2360
+ MachineBasicBlock::iterator MBBI = I;
2361
+
2362
+ // If the load is the first instruction in the block, there's obviously
2363
+ // not any matching load or store.
2364
+ if (MBBI == B)
2365
+ return E;
2366
+
2367
+ // Make sure the IndexReg is killed and the shift amount is zero.
2368
+ // TODO: Relex this restriction to extend, simplify processing now.
2369
+ if (!AArch64InstrInfo::getLdStOffsetOp (MemMI).isKill () ||
2370
+ !AArch64InstrInfo::getLdStAmountOp (MemMI).isImm () ||
2371
+ (AArch64InstrInfo::getLdStAmountOp (MemMI).getImm () != 0 ))
2372
+ return E;
2373
+
2374
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp (MemMI).getReg ();
2375
+
2376
+ // Track which register units have been modified and used between the first
2377
+ // insn (inclusive) and the second insn.
2378
+ ModifiedRegUnits.clear ();
2379
+ UsedRegUnits.clear ();
2380
+ unsigned Count = 0 ;
2381
+ do {
2382
+ MBBI = prev_nodbg (MBBI, B);
2383
+ MachineInstr &MI = *MBBI;
2384
+
2385
+ // Don't count transient instructions towards the search limit since there
2386
+ // may be different numbers of them if e.g. debug information is present.
2387
+ if (!MI.isTransient ())
2388
+ ++Count;
2389
+
2390
+ // If we found a match, return it.
2391
+ if (isMatchingMovConstInsn (*I, MI, IndexReg, Offset)) {
2392
+ return MBBI;
2393
+ }
2394
+
2395
+ // Update the status of what the instruction clobbered and used.
2396
+ LiveRegUnits::accumulateUsedDefed (MI, ModifiedRegUnits, UsedRegUnits, TRI);
2397
+
2398
+ // Otherwise, if the index register is used or modified, we have no match,
2399
+ // so return early.
2400
+ if (!ModifiedRegUnits.available (IndexReg) ||
2401
+ !UsedRegUnits.available (IndexReg))
2402
+ return E;
2403
+
2404
+ } while (MBBI != B && Count < Limit);
2405
+ return E;
2406
+ }
2407
+
2223
2408
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore (
2224
2409
MachineBasicBlock::iterator &MBBI) {
2225
2410
MachineInstr &MI = *MBBI;
@@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2404
2589
return false ;
2405
2590
}
2406
2591
2592
+ bool AArch64LoadStoreOpt::tryToMergeIndexLdSt (MachineBasicBlock::iterator &MBBI,
2593
+ int Scale) {
2594
+ MachineInstr &MI = *MBBI;
2595
+ MachineBasicBlock::iterator E = MI.getParent ()->end ();
2596
+ MachineBasicBlock::iterator Update;
2597
+
2598
+ // Don't know how to handle unscaled pre/post-index versions below, so bail.
2599
+ if (TII->hasUnscaledLdStOffset (MI.getOpcode ()))
2600
+ return false ;
2601
+
2602
+ // Look back to try to find a const offset for index LdSt instruction. For
2603
+ // example,
2604
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
2605
+ // ldr x1, [x0, x8]
2606
+ // merged into:
2607
+ // add x8, x0, a * (1<<12)
2608
+ // ldr x1, [x8, imm12]
2609
+ unsigned Offset;
2610
+ Update = findMatchingConstOffsetBackward (MBBI, LdStConstLimit, Offset);
2611
+ if (Update != E && (Offset & (Scale - 1 )) == 0 ) {
2612
+ // Merge the imm12 into the ld/st.
2613
+ MBBI = mergeConstOffsetInsn (MBBI, Update, Offset, Scale);
2614
+ return true ;
2615
+ }
2616
+
2617
+ return false ;
2618
+ }
2619
+
2407
2620
bool AArch64LoadStoreOpt::optimizeBlock (MachineBasicBlock &MBB,
2408
2621
bool EnableNarrowZeroStOpt) {
2409
2622
@@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2482
2695
++MBBI;
2483
2696
}
2484
2697
2698
+ // 5) Find a register assigned with a const value that can be combined with
2699
+ // into the load or store. e.g.,
2700
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
2701
+ // ldr x1, [x0, x8]
2702
+ // ; becomes
2703
+ // add x8, x0, a * (1<<12)
2704
+ // ldr x1, [x8, imm12]
2705
+ for (MachineBasicBlock::iterator MBBI = MBB.begin (), E = MBB.end ();
2706
+ MBBI != E;) {
2707
+ int Scale;
2708
+ if (isMergeableIndexLdSt (*MBBI, Scale) && tryToMergeIndexLdSt (MBBI, Scale))
2709
+ Modified = true ;
2710
+ else
2711
+ ++MBBI;
2712
+ }
2713
+
2485
2714
return Modified;
2486
2715
}
2487
2716
0 commit comments