diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0bc955dbbfea3..eaf4108c64db9 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -278,6 +278,10 @@ static cl::opt<bool> EnableO3NonTrivialUnswitching( "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3")); +static cl::opt<bool> EnableMergeFunctions( + "enable-merge-functions", cl::init(false), cl::Hidden, + cl::desc("Enable function merging as part of the optimization pipeline")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -287,7 +291,7 @@ PipelineTuningOptions::PipelineTuningOptions() { LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; CallGraphProfile = true; - MergeFunctions = false; + MergeFunctions = EnableMergeFunctions; } namespace llvm { @@ -1416,23 +1420,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); - // Split out cold code. Splitting is done late to avoid hiding context from - // other optimizations and inadvertently regressing performance. The tradeoff - // is that this has a higher code size cost than splitting early. - if (EnableHotColdSplit && !LTOPreLink) - MPM.addPass(HotColdSplittingPass()); - - // Search the code for similar regions of code. If enough similar regions can - // be found where extracting the regions into their own function will decrease - // the size of the program, we extract the regions, a deduplicate the - // structurally similar regions. - if (EnableIROutliner) - MPM.addPass(IROutlinerPass()); - - // Merge functions if requested. - if (PTO.MergeFunctions) - MPM.addPass(MergeFunctionsPass()); - - // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. 
As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1459,6 +1446,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, and deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + + // Merge functions if requested. + if (PTO.MergeFunctions) + MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass()); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll new file mode 100644 index 0000000000000..39cd34a98a002 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='default<O3>' -enable-merge-functions -S < %s | FileCheck %s + +; These two structurally equivalent functions get merged: with function merging +; scheduled late in the pipeline, @test2 becomes a tail call of @test1 (checked below). 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i1 @test1(i32 %c) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]] +; CHECK-NEXT: [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1 +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0 +; CHECK-NEXT: [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false +; CHECK-NEXT: ret i1 [[I_0]] +; +entry: + %_4 = alloca i8, align 1 + %_3 = alloca i8, align 1 + %_2 = alloca i8, align 1 + %i = alloca i8, align 1 + %i1 = icmp eq i32 %c, 115 + br i1 %i1, label %bb10, label %bb11 + +bb10: ; preds = %entry + store i8 1, i8* %_4, align 1 + br label %bb12 + +bb11: ; preds = %entry + %_6 = icmp eq i32 %c, 109 + %i2 = zext i1 %_6 to i8 + store i8 %i2, i8* %_4, align 1 + br label %bb12 + +bb12: ; preds = %bb11, %bb10 + %i3 = load i8, i8* %_4, align 1 + %i4 = trunc i8 %i3 to i1 + br i1 %i4, label %bb7, label %bb8 + +bb8: ; preds = %bb12 + %_8 = icmp eq i32 %c, 104 + %i5 = zext i1 %_8 to i8 + store i8 %i5, i8* %_3, align 1 + br label %bb9 + +bb7: ; preds = %bb12 + store i8 1, i8* %_3, align 1 + br label %bb9 + +bb9: ; preds = %bb7, %bb8 + %i6 = load i8, i8* %_3, align 1 + %i7 = trunc i8 %i6 to i1 + br i1 %i7, label %bb4, label %bb5 + +bb5: ; preds = %bb9 + %_10 = icmp eq i32 %c, 100 + %i8 = zext i1 %_10 to i8 + store i8 %i8, i8* %_2, align 1 + br label %bb6 + +bb4: ; preds = %bb9 + store i8 1, i8* %_2, align 1 + br label %bb6 + +bb6: ; preds = %bb4, %bb5 + %i9 = load i8, i8* %_2, align 1 + %i10 = trunc i8 %i9 to i1 + br i1 %i10, label %bb1, label %bb2 + +bb2: ; preds = %bb6 + %_12 = icmp eq i32 %c, 119 + %i11 = zext i1 %_12 to i8 + store i8 %i11, 
i8* %i, align 1 + br label %bb3 + +bb1: ; preds = %bb6 + store i8 1, i8* %i, align 1 + br label %bb3 + +bb3: ; preds = %bb1, %bb2 + %i12 = load i8, i8* %i, align 1 + %i13 = trunc i8 %i12 to i1 + ret i1 %i13 +} + +define i1 @test2(i32 %c) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret i1 [[TMP2]] +; +entry: + %i = alloca i8, align 1 + switch i32 %c, label %bb1 [ + i32 115, label %bb2 + i32 109, label %bb2 + i32 104, label %bb2 + i32 100, label %bb2 + i32 119, label %bb2 + ] + +bb1: ; preds = %entry + store i8 0, i8* %i, align 1 + br label %bb3 + +bb2: ; preds = %entry, %entry, %entry, %entry, %entry + store i8 1, i8* %i, align 1 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %i1 = load i8, i8* %i, align 1 + %i2 = trunc i8 %i1 to i1 + ret i1 %i2 +} +