Skip to content

Backport MergeFunctions scheduling fix #123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions llvm/lib/Passes/PassBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ static cl::opt<bool> EnableO3NonTrivialUnswitching(
"enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));

// Hidden command-line flag, off by default. Its value is read by the
// PipelineTuningOptions constructor to seed PTO.MergeFunctions, which in turn
// gates whether MergeFunctionsPass is added to the module optimization
// pipeline.
static cl::opt<bool> EnableMergeFunctions(
"enable-merge-functions", cl::init(false), cl::Hidden,
cl::desc("Enable function merging as part of the optimization pipeline"));

PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
Expand All @@ -287,7 +291,7 @@ PipelineTuningOptions::PipelineTuningOptions() {
LicmMssaOptCap = SetLicmMssaOptCap;
LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
CallGraphProfile = true;
MergeFunctions = false;
MergeFunctions = EnableMergeFunctions;
}

namespace llvm {
Expand Down Expand Up @@ -1416,23 +1420,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,

addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);

// Split out cold code. Splitting is done late to avoid hiding context from
// other optimizations and inadvertently regressing performance. The tradeoff
// is that this has a higher code size cost than splitting early.
if (EnableHotColdSplit && !LTOPreLink)
MPM.addPass(HotColdSplittingPass());

// Search the code for similar regions of code. If enough similar regions can
// be found where extracting the regions into their own function will decrease
// the size of the program, we extract the regions, and deduplicate the
// structurally similar regions.
if (EnableIROutliner)
MPM.addPass(IROutlinerPass());

// Merge functions if requested.
if (PTO.MergeFunctions)
MPM.addPass(MergeFunctionsPass());

// LoopSink pass sinks instructions hoisted by LICM, which serves as a
// canonicalization pass that enables other optimizations. As a result,
// LoopSink pass needs to be a very late IR pass to avoid undoing LICM
Expand All @@ -1459,6 +1446,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
for (auto &C : OptimizerLastEPCallbacks)
C(MPM, Level);

// Split out cold code. Splitting is done late to avoid hiding context from
// other optimizations and inadvertently regressing performance. The tradeoff
// is that this has a higher code size cost than splitting early.
if (EnableHotColdSplit && !LTOPreLink)
MPM.addPass(HotColdSplittingPass());

// Search the code for similar regions of code. If enough similar regions can
// be found where extracting the regions into their own function will decrease
// the size of the program, we extract the regions, and deduplicate the
// structurally similar regions.
if (EnableIROutliner)
MPM.addPass(IROutlinerPass());

// Merge functions if requested.
if (PTO.MergeFunctions)
MPM.addPass(MergeFunctionsPass());

if (PTO.CallGraphProfile)
MPM.addPass(CGProfilePass());

Expand Down
119 changes: 119 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-merge-functions -S < %s | FileCheck %s

; These two functions compute the same predicate and are expected to be merged:
; the function merging pass runs late enough in the pipeline to see both in
; their fully optimized form, so @test2 becomes a call to @test1.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Returns true iff %c is one of 100, 104, 109, 115 or 119. The input is written
; as a chain of short-circuit "or" comparisons whose intermediate results are
; stored to and reloaded from i8 allocas. The CHECK lines expect the O3
; pipeline to reduce the whole body to a 20-bit mask lookup indexed by
; (%c - 100), guarded by a range check.
define i1 @test1(i32 %c) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20
; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20
; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]]
; CHECK-NEXT: [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1
; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0
; CHECK-NEXT: [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false
; CHECK-NEXT: ret i1 [[I_0]]
;
entry:
%_4 = alloca i8, align 1
%_3 = alloca i8, align 1
%_2 = alloca i8, align 1
%i = alloca i8, align 1
%i1 = icmp eq i32 %c, 115
br i1 %i1, label %bb10, label %bb11

bb10: ; preds = %entry
store i8 1, i8* %_4, align 1
br label %bb12

bb11: ; preds = %entry
%_6 = icmp eq i32 %c, 109
%i2 = zext i1 %_6 to i8
store i8 %i2, i8* %_4, align 1
br label %bb12

bb12: ; preds = %bb11, %bb10
%i3 = load i8, i8* %_4, align 1
%i4 = trunc i8 %i3 to i1
br i1 %i4, label %bb7, label %bb8

bb8: ; preds = %bb12
%_8 = icmp eq i32 %c, 104
%i5 = zext i1 %_8 to i8
store i8 %i5, i8* %_3, align 1
br label %bb9

bb7: ; preds = %bb12
store i8 1, i8* %_3, align 1
br label %bb9

bb9: ; preds = %bb7, %bb8
%i6 = load i8, i8* %_3, align 1
%i7 = trunc i8 %i6 to i1
br i1 %i7, label %bb4, label %bb5

bb5: ; preds = %bb9
%_10 = icmp eq i32 %c, 100
%i8 = zext i1 %_10 to i8
store i8 %i8, i8* %_2, align 1
br label %bb6

bb4: ; preds = %bb9
store i8 1, i8* %_2, align 1
br label %bb6

bb6: ; preds = %bb4, %bb5
%i9 = load i8, i8* %_2, align 1
%i10 = trunc i8 %i9 to i1
br i1 %i10, label %bb1, label %bb2

bb2: ; preds = %bb6
%_12 = icmp eq i32 %c, 119
%i11 = zext i1 %_12 to i8
store i8 %i11, i8* %i, align 1
br label %bb3

bb1: ; preds = %bb6
store i8 1, i8* %i, align 1
br label %bb3

bb3: ; preds = %bb1, %bb2
%i12 = load i8, i8* %i, align 1
%i13 = trunc i8 %i12 to i1
ret i1 %i13
}

; Same predicate as @test1 (true iff %c is 100, 104, 109, 115 or 119), but
; written as a single switch whose result goes through an i8 alloca. The CHECK
; lines expect the body to be replaced by a tail call to @test1, i.e. the two
; functions were recognized as equivalent and merged.
define i1 @test2(i32 %c) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: ret i1 [[TMP2]]
;
entry:
%i = alloca i8, align 1
switch i32 %c, label %bb1 [
i32 115, label %bb2
i32 109, label %bb2
i32 104, label %bb2
i32 100, label %bb2
i32 119, label %bb2
]

bb1: ; preds = %entry
store i8 0, i8* %i, align 1
br label %bb3

bb2: ; preds = %entry, %entry, %entry, %entry, %entry
store i8 1, i8* %i, align 1
br label %bb3

bb3: ; preds = %bb2, %bb1
%i1 = load i8, i8* %i, align 1
%i2 = trunc i8 %i1 to i1
ret i1 %i2
}