Skip to content

Commit 1468822

Browse files
committed
[GlobalISel][Localizer] Rewrite localizer to run in 2 phases, inter & intra block.
Inter-block localization is the same as what currently happens, except now it only runs on the entry block because that's where the problematic constants with long live ranges come from. The second phase is a new intra-block localization phase which attempts to re-sink the already localized instructions further, right before one of the multiple uses.

One additional change is to also localize G_GLOBAL_VALUE as they're constants too. However, on some targets like arm64 it takes multiple instructions to materialize the value, so some additional heuristics with a TTI hook have been introduced to attempt to prevent code size regressions when localizing these.

Overall, these changes improve CTMark code size on arm64 by 1.2%.

Full code size results:

Program                                         baseline  new      diff
------------------------------------------------------------------------------
test-suite...-typeset/consumer-typeset.test     1249984   1217216  -2.6%
test-suite...:: CTMark/ClamAV/clamscan.test     1264928   1232152  -2.6%
test-suite :: CTMark/SPASS/SPASS.test           1394092   1361316  -2.4%
test-suite...Mark/mafft/pairlocalalign.test     731320    714928   -2.2%
test-suite :: CTMark/lencod/lencod.test         1340592   1324200  -1.2%
test-suite :: CTMark/kimwitu++/kc.test          3853512   3820420  -0.9%
test-suite :: CTMark/Bullet/bullet.test         3406036   3389652  -0.5%
test-suite...ark/tramp3d-v4/tramp3d-v4.test     8017000   8016992  -0.0%
test-suite...TMark/7zip/7zip-benchmark.test     2856588   2856588   0.0%
test-suite...:: CTMark/sqlite3/sqlite3.test     765704    765704    0.0%
Geomean difference                                                 -1.2%

Differential Revision: https://reviews.llvm.org/D63303
llvm-svn: 363632
1 parent f9bff2a commit 1468822

File tree

8 files changed

+341
-62
lines changed

8 files changed

+341
-62
lines changed

Diff for: llvm/include/llvm/Analysis/TargetTransformInfo.h

+11
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,11 @@ class TargetTransformInfo {
10531053
/// \returns True if the target wants to expand the given reduction intrinsic
10541054
/// into a shuffle sequence.
10551055
bool shouldExpandReduction(const IntrinsicInst *II) const;
1056+
1057+
/// \returns the size cost of rematerializing a GlobalValue address relative
1058+
/// to a stack reload.
1059+
unsigned getGISelRematGlobalCost() const;
1060+
10561061
/// @}
10571062

10581063
private:
@@ -1269,6 +1274,7 @@ class TargetTransformInfo::Concept {
12691274
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
12701275
ReductionFlags) const = 0;
12711276
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1277+
virtual unsigned getGISelRematGlobalCost() const = 0;
12721278
virtual int getInstructionLatency(const Instruction *I) = 0;
12731279
};
12741280

@@ -1701,6 +1707,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
17011707
bool shouldExpandReduction(const IntrinsicInst *II) const override {
17021708
return Impl.shouldExpandReduction(II);
17031709
}
1710+
1711+
unsigned getGISelRematGlobalCost() const override {
1712+
return Impl.getGISelRematGlobalCost();
1713+
}
1714+
17041715
int getInstructionLatency(const Instruction *I) override {
17051716
return Impl.getInstructionLatency(I);
17061717
}

Diff for: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+4
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,10 @@ class TargetTransformInfoImplBase {
572572
return true;
573573
}
574574

575+
unsigned getGISelRematGlobalCost() const {
576+
return 1;
577+
}
578+
575579
protected:
576580
// Obtain the minimum required size to hold the value (without the sign)
577581
// In case of a vector it returns the min required size for one element.

Diff for: llvm/include/llvm/CodeGen/GlobalISel/Localizer.h

+11-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
namespace llvm {
2828
// Forward declarations.
2929
class MachineRegisterInfo;
30+
class TargetTransformInfo;
3031

3132
/// This pass implements the localization mechanism described at the
3233
/// top of this file. One specificity of the implementation is that
@@ -43,9 +44,11 @@ class Localizer : public MachineFunctionPass {
4344
/// MRI contains all the register class/bank information that this
4445
/// pass uses and updates.
4546
MachineRegisterInfo *MRI;
47+
/// TTI used for getting remat costs for instructions.
48+
TargetTransformInfo *TTI;
4649

4750
/// Check whether or not \p MI needs to be moved close to its uses.
48-
static bool shouldLocalize(const MachineInstr &MI);
51+
bool shouldLocalize(const MachineInstr &MI);
4952

5053
/// Check if \p MOUse is used in the same basic block as \p Def.
5154
/// If the use is in the same block, we say it is local.
@@ -57,6 +60,13 @@ class Localizer : public MachineFunctionPass {
5760
/// Initialize the field members using \p MF.
5861
void init(MachineFunction &MF);
5962

63+
/// Do inter-block localization from the entry block.
64+
bool localizeInterBlock(MachineFunction &MF,
65+
SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);
66+
67+
/// Do intra-block localization of already localized instructions.
68+
bool localizeIntraBlock(SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);
69+
6070
public:
6171
Localizer();
6272

Diff for: llvm/lib/Analysis/TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,10 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
724724
return TTIImpl->shouldExpandReduction(II);
725725
}
726726

727+
unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
728+
return TTIImpl->getGISelRematGlobalCost();
729+
}
730+
727731
int TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
728732
return TTIImpl->getInstructionLatency(I);
729733
}

Diff for: llvm/lib/CodeGen/GlobalISel/Localizer.cpp

+158-61
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//===----------------------------------------------------------------------===//
1111

1212
#include "llvm/CodeGen/GlobalISel/Localizer.h"
13+
#include "llvm/Analysis/TargetTransformInfo.h"
1314
#include "llvm/ADT/DenseMap.h"
1415
#include "llvm/ADT/SmallPtrSet.h"
1516
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -20,17 +21,55 @@
2021
using namespace llvm;
2122

2223
char Localizer::ID = 0;
23-
INITIALIZE_PASS(Localizer, DEBUG_TYPE,
24-
"Move/duplicate certain instructions close to their use", false,
25-
false)
24+
INITIALIZE_PASS_BEGIN(Localizer, DEBUG_TYPE,
25+
"Move/duplicate certain instructions close to their use",
26+
false, false)
27+
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
28+
INITIALIZE_PASS_END(Localizer, DEBUG_TYPE,
29+
"Move/duplicate certain instructions close to their use",
30+
false, false)
2631

2732
Localizer::Localizer() : MachineFunctionPass(ID) {
2833
initializeLocalizerPass(*PassRegistry::getPassRegistry());
2934
}
3035

31-
void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); }
36+
void Localizer::init(MachineFunction &MF) {
37+
MRI = &MF.getRegInfo();
38+
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());
39+
}
3240

3341
bool Localizer::shouldLocalize(const MachineInstr &MI) {
42+
// Assuming a spill and reload of a value has a cost of 1 instruction each,
43+
// this helper function computes the maximum number of uses we should consider
44+
// for remat. E.g. on arm64 global addresses take 2 insts to materialize. We
45+
// break even in terms of code size when the original MI has 2 users vs
46+
// choosing to potentially spill. With more than 2 users we have a net code
47+
// size increase. This doesn't take into account register pressure though.
48+
auto maxUses = [](unsigned RematCost) {
49+
// A cost of 1 means remats are basically free.
50+
if (RematCost == 1)
51+
return UINT_MAX;
52+
if (RematCost == 2)
53+
return 2U;
54+
55+
// Remat is too expensive, only sink if there's one user.
56+
if (RematCost > 2)
57+
return 1U;
58+
llvm_unreachable("Unexpected remat cost");
59+
};
60+
61+
// Helper to walk through uses and terminate if we've reached a limit. Saves
62+
// us spending time traversing every use if all we want to know is whether there are more than MaxUses.
63+
auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
64+
unsigned NumUses = 0;
65+
auto UI = MRI->use_instr_nodbg_begin(Reg), UE = MRI->use_instr_nodbg_end();
66+
for (; UI != UE && NumUses < MaxUses; ++UI) {
67+
NumUses++;
68+
}
69+
// If we haven't reached the end yet then there are more than MaxUses users.
70+
return UI == UE;
71+
};
72+
3473
switch (MI.getOpcode()) {
3574
default:
3675
return false;
@@ -40,10 +79,20 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) {
4079
case TargetOpcode::G_FCONSTANT:
4180
case TargetOpcode::G_FRAME_INDEX:
4281
return true;
82+
case TargetOpcode::G_GLOBAL_VALUE: {
83+
unsigned RematCost = TTI->getGISelRematGlobalCost();
84+
unsigned Reg = MI.getOperand(0).getReg();
85+
unsigned MaxUses = maxUses(RematCost);
86+
if (MaxUses == UINT_MAX)
87+
return true; // Remats are "free" so always localize.
88+
bool B = isUsesAtMost(Reg, MaxUses);
89+
return B;
90+
}
4391
}
4492
}
4593

4694
void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
95+
AU.addRequired<TargetTransformInfoWrapperPass>();
4796
getSelectionDAGFallbackAnalysisUsage(AU);
4897
MachineFunctionPass::getAnalysisUsage(AU);
4998
}
@@ -57,6 +106,106 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
57106
return InsertMBB == Def.getParent();
58107
}
59108

109+
bool Localizer::localizeInterBlock(
110+
MachineFunction &MF, SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
111+
bool Changed = false;
112+
DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
113+
114+
// Since the IRTranslator only emits constants into the entry block, and the
115+
// rest of the GISel pipeline generally emits constants close to their users,
116+
// we only localize instructions in the entry block here. This might change if
117+
// we start doing CSE across blocks.
118+
auto &MBB = MF.front();
119+
for (MachineInstr &MI : MBB) {
120+
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
121+
continue;
122+
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
123+
assert(MI.getDesc().getNumDefs() == 1 &&
124+
"More than one definition not supported yet");
125+
unsigned Reg = MI.getOperand(0).getReg();
126+
// Check if all the users of MI are local.
127+
// We are going to invalidate the list of use operands, so we
128+
// can't use range iterator.
129+
for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
130+
MOIt != MOItEnd;) {
131+
MachineOperand &MOUse = *MOIt++;
132+
// Check if the use is already local.
133+
MachineBasicBlock *InsertMBB;
134+
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
135+
dbgs() << "Checking use: " << MIUse
136+
<< " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
137+
if (isLocalUse(MOUse, MI, InsertMBB))
138+
continue;
139+
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
140+
Changed = true;
141+
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
142+
auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
143+
if (NewVRegIt == MBBWithLocalDef.end()) {
144+
// Create the localized instruction.
145+
MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
146+
LocalizedInstrs.insert(LocalizedMI);
147+
MachineInstr &UseMI = *MOUse.getParent();
148+
if (MRI->hasOneUse(Reg) && !UseMI.isPHI())
149+
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI);
150+
else
151+
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
152+
LocalizedMI);
153+
154+
// Set a new register for the definition.
155+
unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
156+
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
157+
LocalizedMI->getOperand(0).setReg(NewReg);
158+
NewVRegIt =
159+
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
160+
LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
161+
}
162+
LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
163+
<< '\n');
164+
// Update the user reg.
165+
MOUse.setReg(NewVRegIt->second);
166+
}
167+
}
168+
return Changed;
169+
}
170+
171+
bool Localizer::localizeIntraBlock(
172+
SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
173+
bool Changed = false;
174+
175+
// For each already-localized instruction which has multiple users, we
176+
// scan the block top down from the current position until we hit one of them.
177+
178+
// FIXME: Consider doing inst duplication if live ranges are very long due to
179+
// many users, but this case may be better served by regalloc improvements.
180+
181+
for (MachineInstr *MI : LocalizedInstrs) {
182+
unsigned Reg = MI->getOperand(0).getReg();
183+
MachineBasicBlock &MBB = *MI->getParent();
184+
// If the instruction has a single use, we would have already moved it right
185+
// before its user in localizeInterBlock().
186+
if (MRI->hasOneUse(Reg))
187+
continue;
188+
189+
// All of the user MIs of this reg.
190+
SmallPtrSet<MachineInstr *, 32> Users;
191+
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg))
192+
Users.insert(&UseMI);
193+
194+
MachineBasicBlock::iterator II(MI);
195+
++II;
196+
while (II != MBB.end() && !Users.count(&*II))
197+
++II;
198+
199+
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *&*II
200+
<< "\n");
201+
assert(II != MBB.end() && "Didn't find the user in the MBB");
202+
MI->removeFromParent();
203+
MBB.insert(II, MI);
204+
Changed = true;
205+
}
206+
return Changed;
207+
}
208+
60209
bool Localizer::runOnMachineFunction(MachineFunction &MF) {
61210
// If the ISel pipeline failed, do not bother running that pass.
62211
if (MF.getProperties().hasProperty(
@@ -67,62 +216,10 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
67216

68217
init(MF);
69218

70-
bool Changed = false;
71-
// Keep track of the instructions we localized.
72-
// We won't need to process them if we see them later in the CFG.
73-
SmallPtrSet<MachineInstr *, 16> LocalizedInstrs;
74-
DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
75-
// TODO: Do bottom up traversal.
76-
for (MachineBasicBlock &MBB : MF) {
77-
for (MachineInstr &MI : MBB) {
78-
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
79-
continue;
80-
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
81-
assert(MI.getDesc().getNumDefs() == 1 &&
82-
"More than one definition not supported yet");
83-
unsigned Reg = MI.getOperand(0).getReg();
84-
// Check if all the users of MI are local.
85-
// We are going to invalidation the list of use operands, so we
86-
// can't use range iterator.
87-
for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
88-
MOIt != MOItEnd;) {
89-
MachineOperand &MOUse = *MOIt++;
90-
// Check if the use is already local.
91-
MachineBasicBlock *InsertMBB;
92-
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
93-
dbgs() << "Checking use: " << MIUse
94-
<< " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
95-
if (isLocalUse(MOUse, MI, InsertMBB))
96-
continue;
97-
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
98-
Changed = true;
99-
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
100-
auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
101-
if (NewVRegIt == MBBWithLocalDef.end()) {
102-
// Create the localized instruction.
103-
MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
104-
LocalizedInstrs.insert(LocalizedMI);
105-
// Don't try to be smart for the insertion point.
106-
// There is no guarantee that the first seen use is the first
107-
// use in the block.
108-
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
109-
LocalizedMI);
219+
// Keep track of the instructions we localized. We'll do a second pass of
220+
// intra-block localization to further reduce live ranges.
221+
SmallPtrSet<MachineInstr *, 32> LocalizedInstrs;
110222

111-
// Set a new register for the definition.
112-
unsigned NewReg =
113-
MRI->createGenericVirtualRegister(MRI->getType(Reg));
114-
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
115-
LocalizedMI->getOperand(0).setReg(NewReg);
116-
NewVRegIt =
117-
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
118-
LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
119-
}
120-
LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
121-
<< '\n');
122-
// Update the user reg.
123-
MOUse.setReg(NewVRegIt->second);
124-
}
125-
}
126-
}
127-
return Changed;
223+
bool Changed = localizeInterBlock(MF, LocalizedInstrs);
224+
return Changed |= localizeIntraBlock(LocalizedInstrs);
128225
}

Diff for: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

+4
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
165165
return false;
166166
}
167167

168+
unsigned getGISelRematGlobalCost() const {
169+
return 2;
170+
}
171+
168172
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
169173
TTI::ReductionFlags Flags) const;
170174

0 commit comments

Comments
 (0)