Skip to content

Commit 966954c

Browse files
[FuncSpec] Update function specialization to handle phi-chains
When using the LLVM flang compiler with alias analysis (AA) enabled, SPEC2017:548.exchange2_r was running significantly slower than without AA. This was caused by the GVN pass replacing many of the loads in the pre-AA code with phi-nodes that form a long chain of dependencies, which the function specialization was unable to follow. This adds a function to follow phi-nodes when they form a strongly connected component, with some limitations to avoid spending ages analysing phi-nodes. The minimum latency savings also had to be lowered - fewer load instructions means less saving. This also adds some more debug prints to help debug the isProfitable decision. No significant change in compile time or generated code-size. Co-authored-by: Alexandros Lamprineas <alexandros.lamprineas@arm.com>
1 parent f2d8a0a commit 966954c

File tree

3 files changed

+198
-26
lines changed

3 files changed

+198
-26
lines changed

llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
183183
DenseSet<BasicBlock *> DeadBlocks;
184184
// PHI nodes we have visited before.
185185
DenseSet<Instruction *> VisitedPHIs;
186+
// PHI nodes forming a strongly connected component.
187+
DenseSet<PHINode *> StronglyConnectedPHIs;
186188
// PHI nodes we have visited once without successfully constant folding them.
187189
// Once the InstCostVisitor has processed all the specialization arguments,
188190
// it should be possible to determine whether those PHIs can be folded
@@ -217,6 +219,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
217219
Cost estimateSwitchInst(SwitchInst &I);
218220
Cost estimateBranchInst(BranchInst &I);
219221

222+
void discoverStronglyConnectedComponent(PHINode *PN, unsigned Depth);
223+
220224
Constant *visitInstruction(Instruction &I) { return nullptr; }
221225
Constant *visitPHINode(PHINode &I);
222226
Constant *visitFreezeInst(FreezeInst &I);

llvm/lib/Transforms/IPO/FunctionSpecialization.cpp

Lines changed: 107 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,15 @@ static cl::opt<unsigned> MaxClones(
3939
"The maximum number of clones allowed for a single function "
4040
"specialization"));
4141

42+
static cl::opt<unsigned> MaxDiscoveryDepth(
43+
"funcspec-max-discovery-depth", cl::init(10), cl::Hidden,
44+
cl::desc("The maximum recursion depth allowed when searching for strongly "
45+
"connected phis"));
46+
4247
static cl::opt<unsigned> MaxIncomingPhiValues(
43-
"funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc(
44-
"The maximum number of incoming values a PHI node can have to be "
45-
"considered during the specialization bonus estimation"));
48+
"funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden,
49+
cl::desc("The maximum number of incoming values a PHI node can have to be "
50+
"considered during the specialization bonus estimation"));
4651

4752
static cl::opt<unsigned> MaxBlockPredecessors(
4853
"funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
@@ -64,9 +69,9 @@ static cl::opt<unsigned> MinCodeSizeSavings(
6469
"much percent of the original function size"));
6570

6671
static cl::opt<unsigned> MinLatencySavings(
67-
"funcspec-min-latency-savings", cl::init(70), cl::Hidden, cl::desc(
68-
"Reject specializations whose latency savings are less than this"
69-
"much percent of the original function size"));
72+
"funcspec-min-latency-savings", cl::init(45), cl::Hidden,
73+
cl::desc("Reject specializations whose latency savings are less than this"
74+
"much percent of the original function size"));
7075

7176
static cl::opt<unsigned> MinInliningBonus(
7277
"funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
@@ -262,30 +267,86 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
262267
return estimateBasicBlocks(WorkList);
263268
}
264269

270+
void InstCostVisitor::discoverStronglyConnectedComponent(PHINode *PN,
271+
unsigned Depth) {
272+
if (Depth > MaxDiscoveryDepth)
273+
return;
274+
275+
if (PN->getNumIncomingValues() > MaxIncomingPhiValues)
276+
return;
277+
278+
if (!StronglyConnectedPHIs.insert(PN).second)
279+
return;
280+
281+
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
282+
Value *V = PN->getIncomingValue(I);
283+
if (auto *Phi = dyn_cast<PHINode>(V)) {
284+
if (Phi == PN || DeadBlocks.contains(PN->getIncomingBlock(I)))
285+
continue;
286+
discoverStronglyConnectedComponent(Phi, Depth + 1);
287+
}
288+
}
289+
}
290+
265291
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
266292
if (I.getNumIncomingValues() > MaxIncomingPhiValues)
267293
return nullptr;
268294

269295
bool Inserted = VisitedPHIs.insert(&I).second;
270296
Constant *Const = nullptr;
297+
SmallVector<PHINode *, 8> UnknownIncomingValues;
271298

272-
for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
273-
Value *V = I.getIncomingValue(Idx);
274-
if (auto *Inst = dyn_cast<Instruction>(V))
275-
if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
276-
continue;
277-
Constant *C = findConstantFor(V, KnownConstants);
278-
if (!C) {
279-
if (Inserted)
280-
PendingPHIs.push_back(&I);
281-
return nullptr;
299+
auto CanConstantFoldPhi = [&](PHINode *PN) -> bool {
300+
UnknownIncomingValues.clear();
301+
302+
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
303+
Value *V = PN->getIncomingValue(I);
304+
305+
// Disregard self-references and dead incoming values.
306+
if (auto *Inst = dyn_cast<Instruction>(V))
307+
if (Inst == PN || DeadBlocks.contains(PN->getIncomingBlock(I)))
308+
continue;
309+
310+
if (Constant *C = findConstantFor(V, KnownConstants)) {
311+
if (!Const)
312+
Const = C;
313+
// Not all incoming values are the same constant. Bail immediately.
314+
else if (C != Const)
315+
return false;
316+
} else if (auto *Phi = dyn_cast<PHINode>(V)) {
317+
// It's not a strongly connected phi. Collect it and bail at the end.
318+
if (!StronglyConnectedPHIs.contains(Phi))
319+
UnknownIncomingValues.push_back(Phi);
320+
} else {
321+
// We can't reason about anything else.
322+
return false;
323+
}
324+
}
325+
return UnknownIncomingValues.empty();
326+
};
327+
328+
if (CanConstantFoldPhi(&I))
329+
return Const;
330+
331+
if (Inserted) {
332+
// First time we are seeing this phi. We'll retry later, after all
333+
// the constant arguments have been propagated. Bail for now.
334+
PendingPHIs.push_back(&I);
335+
return nullptr;
336+
}
337+
338+
for (PHINode *Phi : UnknownIncomingValues)
339+
discoverStronglyConnectedComponent(Phi, 1);
340+
341+
bool CannotConstantFoldPhi = false;
342+
for (PHINode *Phi : StronglyConnectedPHIs) {
343+
if (!CanConstantFoldPhi(Phi)) {
344+
CannotConstantFoldPhi = true;
345+
break;
282346
}
283-
if (!Const)
284-
Const = C;
285-
else if (C != Const)
286-
return nullptr;
287347
}
288-
return Const;
348+
StronglyConnectedPHIs.clear();
349+
return CannotConstantFoldPhi ? nullptr : Const;
289350
}
290351

291352
Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
@@ -809,20 +870,40 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
809870
auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
810871
unsigned FuncGrowth) -> bool {
811872
// No check required.
812-
if (ForceSpecialization)
873+
if (ForceSpecialization) {
874+
LLVM_DEBUG(dbgs() << "Force is on\n");
813875
return true;
876+
}
814877
// Minimum inlining bonus.
815-
if (Score > MinInliningBonus * FuncSize / 100)
878+
if (Score > MinInliningBonus * FuncSize / 100) {
879+
LLVM_DEBUG(dbgs()
880+
<< "FnSpecialization: Min inliningbous: Score = " << Score
881+
<< " > " << MinInliningBonus * FuncSize / 100 << "\n");
816882
return true;
883+
}
817884
// Minimum codesize savings.
818-
if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
885+
if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) {
886+
LLVM_DEBUG(dbgs()
887+
<< "FnSpecialization: Min CodeSize Saving: CodeSize = "
888+
<< B.CodeSize << " > "
889+
<< MinCodeSizeSavings * FuncSize / 100 << "\n");
819890
return false;
891+
}
820892
// Minimum latency savings.
821-
if (B.Latency < MinLatencySavings * FuncSize / 100)
893+
if (B.Latency < MinLatencySavings * FuncSize / 100) {
894+
LLVM_DEBUG(dbgs()
895+
<< "FnSpecialization: Min Latency Saving: Latency = "
896+
<< B.Latency << " > " << MinLatencySavings * FuncSize / 100
897+
<< "\n");
822898
return false;
899+
}
823900
// Maximum codesize growth.
824-
if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
901+
if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) {
902+
LLVM_DEBUG(dbgs() << "FnSpecialization: Max Func Growth: CodeSize = "
903+
<< FuncGrowth / FuncSize << " > "
904+
<< MaxCodeSizeGrowth << "\n");
825905
return false;
906+
}
826907
return true;
827908
};
828909

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
;
3+
; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -S < %s | FileCheck %s --check-prefix=FUNCSPEC
4+
; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -funcspec-max-discovery-depth=5 -S < %s | FileCheck %s --check-prefix=NOFUNCSPEC
5+
6+
define i64 @bar(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
7+
; FUNCSPEC-LABEL: define i64 @bar(
8+
; FUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
9+
; FUNCSPEC-NEXT: entry:
10+
; FUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
11+
; FUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG1:![0-9]+]]
12+
; FUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
13+
; FUNCSPEC-NEXT: ret i64 [[ADD]]
14+
;
15+
; NOFUNCSPEC-LABEL: define i64 @bar(
16+
; NOFUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
17+
; NOFUNCSPEC-NEXT: entry:
18+
; NOFUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
19+
; NOFUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0]]
20+
; NOFUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
21+
; NOFUNCSPEC-NEXT: ret i64 [[ADD]]
22+
;
23+
entry:
24+
%f1 = call i64 @foo(i64 3, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10)
25+
%f2 = call i64 @foo(i64 4, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10)
26+
%add = add i64 %f1, %f2
27+
ret i64 %add
28+
}
29+
30+
define internal i64 @foo(i64 %n, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
31+
entry:
32+
br i1 %c1, label %l1, label %l9
33+
34+
l1:
35+
%phi1 = phi i64 [ %n, %entry ], [ %phi2, %l2 ]
36+
%add = add i64 %phi1, 1
37+
%div = sdiv i64 %add, 2
38+
br i1 %c2, label %l1_5, label %exit
39+
40+
l1_5:
41+
br i1 %c3, label %l1_75, label %l6
42+
43+
l1_75:
44+
br i1 %c4, label %l2, label %l3
45+
46+
l2:
47+
%phi2 = phi i64 [ %phi1, %l1_75 ], [ %phi3, %l3 ]
48+
br label %l1
49+
50+
l3:
51+
%phi3 = phi i64 [ %phi1, %l1_75 ], [ %phi4, %l4 ]
52+
br label %l2
53+
54+
l4:
55+
%phi4 = phi i64 [ %phi5, %l5 ], [ %phi6, %l6 ]
56+
br i1 %c5, label %l3, label %l6
57+
58+
l5:
59+
%phi5 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ]
60+
br label %l4
61+
62+
l6:
63+
%phi6 = phi i64 [ %phi4, %l4 ], [ %phi1, %l1_5 ]
64+
br i1 %c6, label %l4, label %l6_5
65+
66+
l6_5:
67+
br i1 %c7, label %l5, label %l8
68+
69+
l7:
70+
%phi7 = phi i64 [ %phi9, %l9 ], [ %phi8, %l8 ]
71+
br i1 %c8, label %l5, label %l8
72+
73+
l8:
74+
%phi8 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ]
75+
br i1 %c9, label %l7, label %l9
76+
77+
l9:
78+
%phi9 = phi i64 [ %n, %entry ], [ %phi8, %l8 ]
79+
%sub = sub i64 %phi9, 1
80+
%mul = mul i64 %sub, 2
81+
br i1 %c10, label %l7, label %exit
82+
83+
exit:
84+
%res = phi i64 [ %div, %l1 ], [ %mul, %l9]
85+
ret i64 %res
86+
}
87+

0 commit comments

Comments
 (0)