diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0bc955dbbfea3..eaf4108c64db9 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -278,6 +278,10 @@ static cl::opt<bool> EnableO3NonTrivialUnswitching( "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3")); +static cl::opt<bool> EnableMergeFunctions( + "enable-merge-functions", cl::init(false), cl::Hidden, + cl::desc("Enable function merging as part of the optimization pipeline")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -287,7 +291,7 @@ PipelineTuningOptions::PipelineTuningOptions() { LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; CallGraphProfile = true; - MergeFunctions = false; + MergeFunctions = EnableMergeFunctions; } namespace llvm { @@ -1416,23 +1420,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); - // Split out cold code. Splitting is done late to avoid hiding context from - // other optimizations and inadvertently regressing performance. The tradeoff - // is that this has a higher code size cost than splitting early. - if (EnableHotColdSplit && !LTOPreLink) - MPM.addPass(HotColdSplittingPass()); - - // Search the code for similar regions of code. If enough similar regions can - // be found where extracting the regions into their own function will decrease - // the size of the program, we extract the regions, a deduplicate the - // structurally similar regions. - if (EnableIROutliner) - MPM.addPass(IROutlinerPass()); - - // Merge functions if requested. - if (PTO.MergeFunctions) - MPM.addPass(MergeFunctionsPass()); - - // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. 
As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1459,6 +1446,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, and deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + + // Merge functions if requested. + if (PTO.MergeFunctions) + MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass()); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll new file mode 100644 index 0000000000000..39cd34a98a002 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='default<O3>' -enable-merge-functions -S < %s | FileCheck %s + +; These two structurally equivalent functions get merged: with function merging +; scheduled late in the pipeline, @test2 becomes a tail call of @test1 (checked below). 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i1 @test1(i32 %c) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]] +; CHECK-NEXT: [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1 +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0 +; CHECK-NEXT: [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false +; CHECK-NEXT: ret i1 [[I_0]] +; +entry: + %_4 = alloca i8, align 1 + %_3 = alloca i8, align 1 + %_2 = alloca i8, align 1 + %i = alloca i8, align 1 + %i1 = icmp eq i32 %c, 115 + br i1 %i1, label %bb10, label %bb11 + +bb10: ; preds = %entry + store i8 1, i8* %_4, align 1 + br label %bb12 + +bb11: ; preds = %entry + %_6 = icmp eq i32 %c, 109 + %i2 = zext i1 %_6 to i8 + store i8 %i2, i8* %_4, align 1 + br label %bb12 + +bb12: ; preds = %bb11, %bb10 + %i3 = load i8, i8* %_4, align 1 + %i4 = trunc i8 %i3 to i1 + br i1 %i4, label %bb7, label %bb8 + +bb8: ; preds = %bb12 + %_8 = icmp eq i32 %c, 104 + %i5 = zext i1 %_8 to i8 + store i8 %i5, i8* %_3, align 1 + br label %bb9 + +bb7: ; preds = %bb12 + store i8 1, i8* %_3, align 1 + br label %bb9 + +bb9: ; preds = %bb7, %bb8 + %i6 = load i8, i8* %_3, align 1 + %i7 = trunc i8 %i6 to i1 + br i1 %i7, label %bb4, label %bb5 + +bb5: ; preds = %bb9 + %_10 = icmp eq i32 %c, 100 + %i8 = zext i1 %_10 to i8 + store i8 %i8, i8* %_2, align 1 + br label %bb6 + +bb4: ; preds = %bb9 + store i8 1, i8* %_2, align 1 + br label %bb6 + +bb6: ; preds = %bb4, %bb5 + %i9 = load i8, i8* %_2, align 1 + %i10 = trunc i8 %i9 to i1 + br i1 %i10, label %bb1, label %bb2 + +bb2: ; preds = %bb6 + %_12 = icmp eq i32 %c, 119 + %i11 = zext i1 %_12 to i8 + store i8 %i11, 
i8* %i, align 1 + br label %bb3 + +bb1: ; preds = %bb6 + store i8 1, i8* %i, align 1 + br label %bb3 + +bb3: ; preds = %bb1, %bb2 + %i12 = load i8, i8* %i, align 1 + %i13 = trunc i8 %i12 to i1 + ret i1 %i13 +} + +define i1 @test2(i32 %c) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret i1 [[TMP2]] +; +entry: + %i = alloca i8, align 1 + switch i32 %c, label %bb1 [ + i32 115, label %bb2 + i32 109, label %bb2 + i32 104, label %bb2 + i32 100, label %bb2 + i32 119, label %bb2 + ] + +bb1: ; preds = %entry + store i8 0, i8* %i, align 1 + br label %bb3 + +bb2: ; preds = %entry, %entry, %entry, %entry, %entry + store i8 1, i8* %i, align 1 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %i1 = load i8, i8* %i, align 1 + %i2 = trunc i8 %i1 to i1 + ret i1 %i2 +} +