-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[llvm][GlobalOpt] Optimize statically resolvable IFuncs #80606
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Jon Roelofs (jroelofs) ChangesFull diff: https://github.com/llvm/llvm-project/pull/80606.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 951372adcfa93..8fce21a9f9dd9 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -89,6 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
STATISTIC(NumInternalFunc, "Number of internal functions");
STATISTIC(NumColdCC, "Number of functions marked coldcc");
+STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2404,6 +2405,42 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
return Changed;
}
+static Function *hasSideeffectFreeStaticResolution(GlobalIFunc &IF) {
+ Function *Resolver = IF.getResolverFunction();
+ if (!Resolver)
+ return nullptr;
+
+ Function *Callee = nullptr;
+ for (BasicBlock &BB : *Resolver) {
+ if (any_of(BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return nullptr;
+
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (auto *F = dyn_cast<Function>(Ret->getReturnValue())) {
+ if (!Callee)
+ Callee = F;
+ if (F != Callee)
+ return nullptr;
+ }
+ }
+
+ return Callee;
+}
+
+/// Find IFuncs that have resolvers that always point at the same statically
+/// known callee, and replace their callers with a direct call.
+static bool OptimizeStaticIFuncs(Module &M) {
+ bool Changed = false;
+ for (GlobalIFunc &IF : M.ifuncs())
+ if (Function *Callee = hasSideeffectFreeStaticResolution(IF))
+ if (!IF.use_empty()) {
+ IF.replaceAllUsesWith(Callee);
+ NumIFuncsResolved++;
+ Changed = true;
+ }
+ return Changed;
+}
+
static bool
optimizeGlobalsInModule(Module &M, const DataLayout &DL,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2464,6 +2501,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
if (CXAAtExitFn)
LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+ // Optimize IFuncs whose callee's are statically known.
+ LocalChange |= OptimizeStaticIFuncs(M);
+
Changed |= LocalChange;
}
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll
new file mode 100644
index 0000000000000..d51e9111ad2cd
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll
@@ -0,0 +1,81 @@
+; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+$callee_with_trivial_resolver.resolver = comdat any
+@callee_with_trivial_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_trivial_resolver.resolver
+define weak_odr ptr @callee_with_trivial_resolver.resolver() comdat {
+ ret ptr @callee_with_trivial_resolver._Msimd
+}
+define void @callee_with_trivial_resolver._Msimd() {
+ ret void
+}
+define void @callee_with_trivial_resolver.default() {
+ ret void
+}
+
+@unknown_condition = external global i1
+$callee_with_complex_static_resolver.resolver = comdat any
+@callee_with_complex_static_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_complex_static_resolver.resolver
+define weak_odr ptr @callee_with_complex_static_resolver.resolver() comdat {
+entry:
+ %v = load i1, ptr @unknown_condition
+ br i1 %v, label %fast, label %slow
+fast:
+ ret ptr @callee_with_complex_static_resolver._Msimd
+slow:
+ ret ptr @callee_with_complex_static_resolver._Msimd
+}
+define void @callee_with_complex_static_resolver._Msimd() {
+ ret void
+}
+define void @callee_with_complex_static_resolver.default() {
+ ret void
+}
+
+$callee_with_complex_dynamic_resolver.resolver = comdat any
+@callee_with_complex_dynamic_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_complex_dynamic_resolver.resolver
+define weak_odr ptr @callee_with_complex_dynamic_resolver.resolver() comdat {
+entry:
+ %v = load i1, ptr @unknown_condition
+ br i1 %v, label %fast, label %slow
+fast:
+ ret ptr @callee_with_complex_dynamic_resolver._Msimd
+slow:
+ ret ptr @callee_with_complex_dynamic_resolver.default
+}
+define void @callee_with_complex_dynamic_resolver._Msimd() {
+ ret void
+}
+define void @callee_with_complex_dynamic_resolver.default() {
+ ret void
+}
+
+$callee_with_sideeffects_resolver.resolver = comdat any
+@callee_with_sideeffects_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_sideeffects_resolver.resolver
+define weak_odr ptr @callee_with_sideeffects_resolver.resolver() comdat {
+ store i1 0, ptr @unknown_condition
+ ret ptr @callee_with_sideeffects_resolver.default
+}
+define void @callee_with_sideeffects_resolver._Msimd() {
+ ret void
+}
+define void @callee_with_sideeffects_resolver.default() {
+ ret void
+}
+
+define void @caller() {
+ call void @callee_with_trivial_resolver.ifunc()
+ call void @callee_with_complex_static_resolver.ifunc()
+ call void @callee_with_complex_dynamic_resolver.ifunc()
+ call void @callee_with_sideeffects_resolver.ifunc()
+ ret void
+}
+
+; CHECK-LABEL: define void @caller()
+; CHECK-NEXT: call void @callee_with_trivial_resolver._Msimd()
+; CHECK-NEXT: call void @callee_with_complex_static_resolver._Msimd()
+; CHECK-NEXT: call void @callee_with_complex_dynamic_resolver.ifunc()
+; CHECK-NEXT: call void @callee_with_sideeffects_resolver.ifunc()
+; CHECK-NEXT: ret void
|
This is a baby step towards #71714 |
And here's an end-to-end example where it would apply: https://clang.godbolt.org/z/PYx6jT4W5 |
f85979e
to
5318a4d
Compare
return nullptr; | ||
|
||
Function *Callee = nullptr; | ||
for (BasicBlock &BB : *Resolver) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't account for divergence effects (infinite loops). I'd recommend to only handle the case with a single basic block and rely on preceding simplification to reduce it to that form. (Unless there is a phase-ordering reason to believe this will not happen?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great point. I have a follow-up in mind I should ask your advice on then:
I want to extend this (in subsequent patch(es)) to de-virtualize call sites when the result is not quite as obvious. Looking at attributes on the caller will give us some known bits on __aarch64_cpu_features.features
, and if that leads to the resolver being constant-foldable in the context of that caller, we can make a direct call.
Would you clone the resolver for each call-site, RAUW the loads from __aarch64_cpu_features.features
with __aarch64_cpu_features.features | known_bits
and then InstCombine+SimplifyCFG, followed by reverting the call site back to the original resolver if it didn't work out & DCE-ing the remaining cruft? Or would you attempt to do a more analysis/reasoning-based approach?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd go for a symbolic evaluation approach. Basically go through the instructions, try to constant fold (or instsimplify) them, plus handle the specific feature pattern you are looking for. GlobalOpt does something similar for evaluating global_ctors (https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Utils/Evaluator.cpp), though I expect that what you need here would be much simpler. (The hard part of evaluating global_ctors is mutating global state, which presumably isn't relevant here.)
Doing this kind of speculative optimization + rollback is something we try to avoid, as it is expensive. I don't think we use function cloning for this purpose anywhere. (IPSCCP does clone functions, but not as an analysis mechanism.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have tried a few things to achieve what @jroelofs is describing but none of them seems ideal:
- extend IPSCCP to create specializations of the resolver
- constant fold the resolver per callsite in InstCombineCalls.cpp
The second attempt looks better but I am not sure Instcombine allows such expensive transformations. Another concern is that when constant folding I am handling a few types of instructions which I know the FMV resolver on AArch64 will have (binop, compare, select, branch, phi, return). That's quite specific - not generic enough. Also having to use the AArch64 Target Parser to create a constant mask, or looking for loads from address __aarch64_cpu_features doesn't fit well in a target independent optimization. Thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also having to use the AArch64 Target Parser to create a constant mask, or looking for loads from address __aarch64_cpu_features doesn't fit well in a target independent optimization.
These feel like they could be served by TTI hooks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI #87939
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
No description provided.