diff --git a/include/swift/SIL/ApplySite.h b/include/swift/SIL/ApplySite.h
index f98608cbfbeb8..5bba3e24ca966 100644
--- a/include/swift/SIL/ApplySite.h
+++ b/include/swift/SIL/ApplySite.h
@@ -583,6 +583,32 @@ class FullApplySite : public ApplySite {
 
 namespace llvm {
 
+template<>
+struct PointerLikeTypeTraits<swift::ApplySite> {
+public:
+  static inline void *getAsVoidPointer(swift::ApplySite apply) {
+    return (void*)apply.getInstruction();
+  }
+  static inline swift::ApplySite getFromVoidPointer(void *pointer) {
+    return swift::ApplySite((swift::SILInstruction*)pointer);
+  }
+  enum { NumLowBitsAvailable =
+         PointerLikeTypeTraits<swift::SILInstruction *>::NumLowBitsAvailable };
+};
+
+template<>
+struct PointerLikeTypeTraits<swift::FullApplySite> {
+public:
+  static inline void *getAsVoidPointer(swift::FullApplySite apply) {
+    return (void*)apply.getInstruction();
+  }
+  static inline swift::FullApplySite getFromVoidPointer(void *pointer) {
+    return swift::FullApplySite((swift::SILInstruction*)pointer);
+  }
+  enum { NumLowBitsAvailable =
+         PointerLikeTypeTraits<swift::SILInstruction *>::NumLowBitsAvailable };
+};
+
 // An ApplySite casts like a SILInstruction*.
 template <> struct simplify_type<::swift::ApplySite> {
   using SimpleType = ::swift::SILInstruction *;
diff --git a/include/swift/SILOptimizer/Analysis/ArraySemantic.h b/include/swift/SILOptimizer/Analysis/ArraySemantic.h
index 0dbe35c940843..5ecc429dff0f3 100644
--- a/include/swift/SILOptimizer/Analysis/ArraySemantic.h
+++ b/include/swift/SILOptimizer/Analysis/ArraySemantic.h
@@ -45,6 +45,9 @@ enum class ArrayCallKind {
   kArrayUninitializedIntrinsic
 };
 
+/// Return the array semantics call kind of the given function, or kNone if none.
+ArrayCallKind getArraySemanticsKind(SILFunction *f);
+
 /// Wrapper around array semantic calls.
 class ArraySemanticsCall {
   ApplyInst *SemanticsCall;
@@ -180,9 +183,6 @@ class ArraySemanticsCall {
 
   /// Could this array be backed by an NSArray.
   bool mayHaveBridgedObjectElementType() const;
-
-  /// Can this function be inlined by the early inliner.
- bool canInlineEarly() const; /// If this is a call to ArrayUninitialized (or /// ArrayUninitializedInstrinsic), identify the instructions that store diff --git a/include/swift/SILOptimizer/PassManager/PassManager.h b/include/swift/SILOptimizer/PassManager/PassManager.h index ab90ef989c95f..0f37e046cbf9e 100644 --- a/include/swift/SILOptimizer/PassManager/PassManager.h +++ b/include/swift/SILOptimizer/PassManager/PassManager.h @@ -258,16 +258,7 @@ class SILPassManager { } } - void executePassPipelinePlan(const SILPassPipelinePlan &Plan) { - for (const SILPassPipeline &Pipeline : Plan.getPipelines()) { - setStageName(Pipeline.Name); - resetAndRemoveTransformations(); - for (PassKind Kind : Plan.getPipelinePasses(Pipeline)) { - addPass(Kind); - } - execute(); - } - } + void executePassPipelinePlan(const SILPassPipelinePlan &Plan); void registerIRGenPass(PassKind Kind, SILTransform *Transform) { assert(IRGenPasses.find(unsigned(Kind)) == IRGenPasses.end() && diff --git a/include/swift/SILOptimizer/PassManager/PassPipeline.h b/include/swift/SILOptimizer/PassManager/PassPipeline.h index c12f339dabe01..acc90e2181aac 100644 --- a/include/swift/SILOptimizer/PassManager/PassPipeline.h +++ b/include/swift/SILOptimizer/PassManager/PassPipeline.h @@ -87,7 +87,7 @@ class SILPassPipelinePlan final { void print(llvm::raw_ostream &os); - void startPipeline(StringRef Name = ""); + void startPipeline(StringRef Name = "", bool isFunctionPassPipeline = false); using PipelineKindIterator = decltype(Kinds)::const_iterator; using PipelineKindRange = iterator_range; iterator_range @@ -104,11 +104,14 @@ struct SILPassPipeline final { unsigned ID; StringRef Name; unsigned KindOffset; + bool isFunctionPassPipeline; }; -inline void SILPassPipelinePlan::startPipeline(StringRef Name) { +inline void SILPassPipelinePlan:: +startPipeline(StringRef Name, bool isFunctionPassPipeline) { PipelineStages.push_back(SILPassPipeline{ - unsigned(PipelineStages.size()), Name, unsigned(Kinds.size())}); + unsigned(PipelineStages.size()), Name, unsigned(Kinds.size()), + isFunctionPassPipeline}); } inline SILPassPipelinePlan::PipelineKindRange diff --git a/include/swift/SILOptimizer/Utils/Devirtualize.h b/include/swift/SILOptimizer/Utils/Devirtualize.h index 6301c64725efd..f7e40221c75a9 100644 --- a/include/swift/SILOptimizer/Utils/Devirtualize.h +++ b/include/swift/SILOptimizer/Utils/Devirtualize.h @@ -65,9 +65,11 @@ SubstitutionMap getWitnessMethodSubstitutions(SILModule &Module, ApplySite AI, /// /// If this succeeds, the caller must call deleteDevirtualizedApply on /// the original apply site. -ApplySite tryDevirtualizeApply(ApplySite AI, - ClassHierarchyAnalysis *CHA, - OptRemark::Emitter *ORE = nullptr); +/// +/// Return the new apply and true if the CFG was also modified. +std::pair +tryDevirtualizeApply(ApplySite AI, ClassHierarchyAnalysis *CHA, + OptRemark::Emitter *ORE = nullptr); bool canDevirtualizeApply(FullApplySite AI, ClassHierarchyAnalysis *CHA); bool canDevirtualizeClassMethod(FullApplySite AI, ClassDecl *CD, OptRemark::Emitter *ORE = nullptr, @@ -79,21 +81,23 @@ CanType getSelfInstanceType(CanType ClassOrMetatypeType); /// Devirtualize the given apply site, which is known to be devirtualizable. /// /// The caller must call deleteDevirtualizedApply on the original apply site. -FullApplySite devirtualizeClassMethod(FullApplySite AI, - SILValue ClassInstance, - ClassDecl *CD, - OptRemark::Emitter *ORE); +/// +/// Return the new apply and true if the CFG was also modified. 
+std::pair<FullApplySite, bool> devirtualizeClassMethod(FullApplySite AI,
+                                                       SILValue ClassInstance,
+                                                       ClassDecl *CD,
+                                                       OptRemark::Emitter *ORE);
 
 /// Attempt to devirtualize the given apply site, which is known to be
 /// of a class method. If this fails, the returned FullApplySite will be null.
 ///
 /// If this succeeds, the caller must call deleteDevirtualizedApply on
 /// the original apply site.
-FullApplySite
-tryDevirtualizeClassMethod(FullApplySite AI,
-                           SILValue ClassInstance,
-                           ClassDecl *CD,
-                           OptRemark::Emitter *ORE,
+///
+/// Return the new apply and true if the CFG was also modified.
+std::pair<FullApplySite, bool>
+tryDevirtualizeClassMethod(FullApplySite AI, SILValue ClassInstance,
+                           ClassDecl *CD, OptRemark::Emitter *ORE,
                            bool isEffectivelyFinalMethod = false);
 
 /// Attempt to devirtualize the given apply site, which is known to be
@@ -102,7 +106,9 @@ tryDevirtualizeClassMethod(FullApplySite AI,
 ///
 /// If this succeeds, the caller must call deleteDevirtualizedApply on
 /// the original apply site.
-ApplySite tryDevirtualizeWitnessMethod(ApplySite AI, OptRemark::Emitter *ORE);
+///
+/// Return the new apply and true if the CFG was also modified.
+std::pair<ApplySite, bool> tryDevirtualizeWitnessMethod(ApplySite AI, OptRemark::Emitter *ORE);
 
 /// Delete a successfully-devirtualized apply site. This must always be
 /// called after devirtualizing an apply; not only is it not semantically
diff --git a/include/swift/SILOptimizer/Utils/PerformanceInlinerUtils.h b/include/swift/SILOptimizer/Utils/PerformanceInlinerUtils.h
index e6933b2660706..bf34a56cf532c 100644
--- a/include/swift/SILOptimizer/Utils/PerformanceInlinerUtils.h
+++ b/include/swift/SILOptimizer/Utils/PerformanceInlinerUtils.h
@@ -33,18 +33,25 @@ class SideEffectAnalysis;
 // Controls the decision to inline functions with @_semantics, @effect and
 // global_init attributes.
 enum class InlineSelection {
-  Everything,
-  NoGlobalInit, // and no availability semantics calls
-  NoSemanticsAndGlobalInit
+  PreModuleSerialization, // no @semantics, no @availability
+  RetainSemantics, // Retain the lowest level of @semantic calls
+  Everything // Full, including global init
 };
 
 // Returns the callee of an apply_inst if it is basically inlinable.
-SILFunction *getEligibleFunction(FullApplySite AI,
-                                 InlineSelection WhatToInline);
+SILFunction *
+getEligibleFunction(FullApplySite AI, InlineSelection WhatToInline,
+                    SmallPtrSetImpl<SILFunction *> &nestedSemanticFunctions);
 
 // Returns true if this is a pure call, i.e. the callee has no side-effects
 // and all arguments are constants.
 bool isPureCall(FullApplySite AI, SideEffectAnalysis *SEA);
+
+// Return true if the given function has a semantic annotation which may be
+// recognized by semantics passes. Such calls should only be inlined after all
+// semantic passes have been able to evaluate them.
+bool isOptimizableSemanticFunction(SILFunction *callee);
+
 } // end swift namespace
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/SILOptimizer/Analysis/ArraySemantic.cpp b/lib/SILOptimizer/Analysis/ArraySemantic.cpp
index 7ccb82ab44774..43e391e302697 100644
--- a/lib/SILOptimizer/Analysis/ArraySemantic.cpp
+++ b/lib/SILOptimizer/Analysis/ArraySemantic.cpp
@@ -21,6 +21,44 @@
 
 using namespace swift;
 
+/// Determine which kind of array semantics function this is.
+ArrayCallKind swift::getArraySemanticsKind(SILFunction *f) { + ArrayCallKind Kind = ArrayCallKind::kNone; + + for (auto &Attrs : f->getSemanticsAttrs()) { + auto Tmp = + llvm::StringSwitch(Attrs) + .Case("array.props.isNativeTypeChecked", + ArrayCallKind::kArrayPropsIsNativeTypeChecked) + .StartsWith("array.init", ArrayCallKind::kArrayInit) + .Case("array.uninitialized", ArrayCallKind::kArrayUninitialized) + .Case("array.uninitialized_intrinsic", ArrayCallKind::kArrayUninitializedIntrinsic) + .Case("array.check_subscript", ArrayCallKind::kCheckSubscript) + .Case("array.check_index", ArrayCallKind::kCheckIndex) + .Case("array.get_count", ArrayCallKind::kGetCount) + .Case("array.get_capacity", ArrayCallKind::kGetCapacity) + .Case("array.get_element", ArrayCallKind::kGetElement) + .Case("array.make_mutable", ArrayCallKind::kMakeMutable) + .Case("array.get_element_address", + ArrayCallKind::kGetElementAddress) + .Case("array.mutate_unknown", ArrayCallKind::kMutateUnknown) + .Case("array.reserve_capacity_for_append", + ArrayCallKind::kReserveCapacityForAppend) + .Case("array.withUnsafeMutableBufferPointer", + ArrayCallKind::kWithUnsafeMutableBufferPointer) + .Case("array.append_contentsOf", ArrayCallKind::kAppendContentsOf) + .Case("array.append_element", ArrayCallKind::kAppendElement) + .Default(ArrayCallKind::kNone); + if (Tmp != ArrayCallKind::kNone) { + assert(Kind == ArrayCallKind::kNone && "Multiple array semantic " + "strings?!"); + Kind = Tmp; + } + } + + return Kind; +} + static ParameterConvention getSelfParameterConvention(ApplyInst *SemanticsCall) { FunctionRefInst *FRI = cast(SemanticsCall->getCallee()); @@ -161,40 +199,7 @@ ArrayCallKind swift::ArraySemanticsCall::getKind() const { auto F = cast(SemanticsCall->getCallee()) ->getInitiallyReferencedFunction(); - ArrayCallKind Kind = ArrayCallKind::kNone; - - for (auto &Attrs : F->getSemanticsAttrs()) { - auto Tmp = - llvm::StringSwitch(Attrs) - .Case("array.props.isNativeTypeChecked", - ArrayCallKind::kArrayPropsIsNativeTypeChecked) - .StartsWith("array.init", ArrayCallKind::kArrayInit) - .Case("array.uninitialized", ArrayCallKind::kArrayUninitialized) - .Case("array.uninitialized_intrinsic", ArrayCallKind::kArrayUninitializedIntrinsic) - .Case("array.check_subscript", ArrayCallKind::kCheckSubscript) - .Case("array.check_index", ArrayCallKind::kCheckIndex) - .Case("array.get_count", ArrayCallKind::kGetCount) - .Case("array.get_capacity", ArrayCallKind::kGetCapacity) - .Case("array.get_element", ArrayCallKind::kGetElement) - .Case("array.make_mutable", ArrayCallKind::kMakeMutable) - .Case("array.get_element_address", - ArrayCallKind::kGetElementAddress) - .Case("array.mutate_unknown", ArrayCallKind::kMutateUnknown) - .Case("array.reserve_capacity_for_append", - ArrayCallKind::kReserveCapacityForAppend) - .Case("array.withUnsafeMutableBufferPointer", - ArrayCallKind::kWithUnsafeMutableBufferPointer) - .Case("array.append_contentsOf", ArrayCallKind::kAppendContentsOf) - .Case("array.append_element", ArrayCallKind::kAppendElement) - .Default(ArrayCallKind::kNone); - if (Tmp != ArrayCallKind::kNone) { - assert(Kind == ArrayCallKind::kNone && "Multiple array semantic " - "strings?!"); - Kind = Tmp; - } - } - - return Kind; + return getArraySemanticsKind(F); } bool swift::ArraySemanticsCall::hasSelf() const { @@ -585,26 +590,6 @@ bool swift::ArraySemanticsCall::mayHaveBridgedObjectElementType() const { return true; } -bool swift::ArraySemanticsCall::canInlineEarly() const { - switch (getKind()) { - default: - return false; - case 
ArrayCallKind::kAppendContentsOf: - case ArrayCallKind::kReserveCapacityForAppend: - case ArrayCallKind::kAppendElement: - case ArrayCallKind::kArrayUninitializedIntrinsic: - // append(Element) calls other semantics functions. Therefore it's - // important that it's inlined by the early inliner (which is before all - // the array optimizations). Also, this semantics is only used to lookup - // Array.append(Element), so inlining it does not prevent any other - // optimization. - // - // Early inlining array.uninitialized_intrinsic semantic call helps in - // stack promotion. - return true; - } -} - SILValue swift::ArraySemanticsCall::getInitializationCount() const { if (getKind() == ArrayCallKind::kArrayUninitialized) { // Can be either a call to _adoptStorage or _allocateUninitialized. diff --git a/lib/SILOptimizer/Analysis/EscapeAnalysis.cpp b/lib/SILOptimizer/Analysis/EscapeAnalysis.cpp index e5b1eec09a578..473156676a422 100644 --- a/lib/SILOptimizer/Analysis/EscapeAnalysis.cpp +++ b/lib/SILOptimizer/Analysis/EscapeAnalysis.cpp @@ -1901,6 +1901,7 @@ void EscapeAnalysis::analyzeInstruction(SILInstruction *I, !isa(I)) { ArraySemanticsCall ASC(FAS.getInstruction()); switch (ASC.getKind()) { + // TODO: Model ReserveCapacityForAppend, AppendContentsOf, AppendElement. case ArrayCallKind::kArrayPropsIsNativeTypeChecked: case ArrayCallKind::kCheckSubscript: case ArrayCallKind::kCheckIndex: diff --git a/lib/SILOptimizer/FunctionSignatureTransforms/FunctionSignatureOpts.cpp b/lib/SILOptimizer/FunctionSignatureTransforms/FunctionSignatureOpts.cpp index 92e6036afb92d..3e3a486dab1a9 100644 --- a/lib/SILOptimizer/FunctionSignatureTransforms/FunctionSignatureOpts.cpp +++ b/lib/SILOptimizer/FunctionSignatureTransforms/FunctionSignatureOpts.cpp @@ -630,18 +630,6 @@ bool FunctionSignatureTransform::run(bool hasCaller) { TransformDescriptor.hasOnlyDirectInModuleCallers; SILFunction *F = TransformDescriptor.OriginalFunction; - // Never repeat the same function signature optimization on the same function. - // Multiple function signature optimizations are composed by successively - // optmizing the newly created functions. Each optimization creates a new - // level of thunk. Those should all be ultimately inlined away. - // - // This happens, for example, when a new reference to the original function is - // discovered during devirtualization. That will cause the original function - // (now and FSO thunk) to be pushed back on the function pass pipeline. - if (F->isThunk() == IsSignatureOptimizedThunk) { - LLVM_DEBUG(llvm::dbgs() << " FSO already performed on this thunk\n"); - return false; - } // If we are asked to assume a caller for testing purposes, set the flag. hasCaller |= FSOOptimizeIfNotCalled; @@ -803,6 +791,19 @@ class FunctionSignatureOpts : public SILFunctionTransform { return; } + // Never repeat the same function signature optimization on the same + // function. Multiple function signature optimizations are composed by + // successively optmizing the newly created functions. Each optimization + // creates a new level of thunk which are all ultimately inlined away. + // + // This happens, for example, when a reference to the original function is + // discovered during devirtualization. That will cause the original function + // (now an FSO thunk) to be pushed back on the function pass pipeline. + if (F->isThunk() == IsSignatureOptimizedThunk) { + LLVM_DEBUG(llvm::dbgs() << " FSO already performed on this thunk\n"); + return; + } + // Ok, we think we can perform optimization. 
Now perform a quick check auto *RCIA = getAnalysis(); auto *EA = PM->getAnalysis(); diff --git a/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp b/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp index 752e74ecc7001..5671194544ffd 100644 --- a/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp +++ b/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp @@ -192,7 +192,10 @@ static bool canAndShouldUnrollLoop(SILLoop *Loop, uint64_t TripCount) { ++Cost; if (auto AI = FullApplySite::isa(&Inst)) { auto Callee = AI.getCalleeFunction(); - if (Callee && getEligibleFunction(AI, InlineSelection::Everything)) { + SmallPtrSet nestedSemanticFunctions; + if (Callee + && getEligibleFunction(AI, InlineSelection::Everything, + nestedSemanticFunctions)) { // If callee is rather big and potentialy inlinable, it may be better // not to unroll, so that the body of the calle can be inlined later. Cost += Callee->size() * InsnsPerBB; diff --git a/lib/SILOptimizer/Mandatory/MandatoryInlining.cpp b/lib/SILOptimizer/Mandatory/MandatoryInlining.cpp index b323bbb69642e..be20f7178a33d 100644 --- a/lib/SILOptimizer/Mandatory/MandatoryInlining.cpp +++ b/lib/SILOptimizer/Mandatory/MandatoryInlining.cpp @@ -755,7 +755,7 @@ getCalleeFunction(SILFunction *F, FullApplySite AI, bool &IsThick, static SILInstruction *tryDevirtualizeApplyHelper(FullApplySite InnerAI, ClassHierarchyAnalysis *CHA) { - auto NewInst = tryDevirtualizeApply(InnerAI, CHA); + auto NewInst = tryDevirtualizeApply(InnerAI, CHA).first; if (!NewInst) return InnerAI.getInstruction(); diff --git a/lib/SILOptimizer/PassManager/PassManager.cpp b/lib/SILOptimizer/PassManager/PassManager.cpp index ef2e70b71957f..8bd9dc623920c 100644 --- a/lib/SILOptimizer/PassManager/PassManager.cpp +++ b/lib/SILOptimizer/PassManager/PassManager.cpp @@ -592,6 +592,19 @@ void SILPassManager::runModulePass(unsigned TransIdx) { } } +void SILPassManager::executePassPipelinePlan(const SILPassPipelinePlan &Plan) { + for (const SILPassPipeline &Pipeline : Plan.getPipelines()) { + setStageName(Pipeline.Name); + resetAndRemoveTransformations(); + for (PassKind Kind : Plan.getPipelinePasses(Pipeline)) { + addPass(Kind); + assert(!Pipeline.isFunctionPassPipeline + || isa(Transformations.back())); + } + execute(); + } +} + void SILPassManager::execute() { const SILOptions &Options = getOptions(); diff --git a/lib/SILOptimizer/PassManager/PassPipeline.cpp b/lib/SILOptimizer/PassManager/PassPipeline.cpp index 67357b3759a1c..9bd0dbd58bf07 100644 --- a/lib/SILOptimizer/PassManager/PassPipeline.cpp +++ b/lib/SILOptimizer/PassManager/PassPipeline.cpp @@ -260,7 +260,13 @@ void addHighLevelLoopOptPasses(SILPassPipelinePlan &P) { } // Perform classic SSA optimizations. -void addSSAPasses(SILPassPipelinePlan &P, OptimizationLevelKind OpLevel) { +// +// No Module Passes Allowed! +// +// Do not add any module passes to this pipeline. It will break the pipeline +// restart functionality. +void addFunctionPasses(SILPassPipelinePlan &P, + OptimizationLevelKind OpLevel) { // Promote box allocations to stack allocations. P.addAllocBoxToStack(); @@ -284,11 +290,26 @@ void addSSAPasses(SILPassPipelinePlan &P, OptimizationLevelKind OpLevel) { // Cleanup, which is important if the inliner has restarted the pass pipeline. P.addPerformanceConstantPropagation(); - P.addSimplifyCFG(); P.addSILCombine(); + addSimplifyCFGSILCombinePasses(P); - // Mainly for Array.append(contentsOf) optimization. 
- P.addArrayElementPropagation(); + // Perform a round of loop/array optimization in the mid-level pipeline after + // potentially inlining semantic calls, e.g. Array append. The high level + // pipeline only optimizes semantic calls *after* inlining (see + // addHighLevelLoopOptPasses). For example, the high-level pipeline may + // performs ArrayElementPropagation and after inlining a level of semantic + // calls, the mid-level pipeline may handle uniqueness hoisting. Do this as + // late as possible before inlining because it must run between runs of the + // inliner when the pipeline restarts. + if (OpLevel == OptimizationLevelKind::MidLevel) { + P.addHighLevelLICM(); + P.addArrayCountPropagation(); + P.addABCOpt(); + P.addDCE(); + P.addCOWArrayOpts(); + P.addDCE(); + P.addSwiftArrayPropertyOpt(); + } // Run the devirtualizer, specializer, and inliner. If any of these // makes a change we'll end up restarting the function passes on the @@ -305,22 +326,6 @@ void addSSAPasses(SILPassPipelinePlan &P, OptimizationLevelKind OpLevel) { P.addEarlyInliner(); break; case OptimizationLevelKind::MidLevel: - P.addGlobalOpt(); - P.addLetPropertiesOpt(); - // It is important to serialize before any of the @_semantics - // functions are inlined, because otherwise the information about - // uses of such functions inside the module is lost, - // which reduces the ability of the compiler to optimize clients - // importing this module. - P.addSerializeSILPass(); - - // Now strip any transparent functions that still have ownership. - if (P.getOptions().StripOwnershipAfterSerialization) - P.addOwnershipModelEliminator(); - - if (P.getOptions().StopOptimizationAfterSerialization) - return; - // Does inline semantics-functions (except "availability"), but not // global-init functions. P.addPerfInliner(); @@ -419,30 +424,58 @@ static void addPerfEarlyModulePassPipeline(SILPassPipelinePlan &P) { P.addCMOSerializeSILPass(); } -static void addHighLevelEarlyLoopOptPipeline(SILPassPipelinePlan &P) { - P.startPipeline("HighLevel+EarlyLoopOpt"); - // FIXME: update this to be a function pass. +// The "high-level" pipeline serves two purposes: +// +// 1. Optimize the standard library Swift module prior to serialization. This +// reduces the amount of work during compilation of all non-stdlib clients. +// +// 2. Optimizer caller functions before inlining semantic calls inside +// callees. This provides more precise escape analysis and side effect analysis +// of callee arguments. +static void addHighLevelFunctionPipeline(SILPassPipelinePlan &P) { + P.startPipeline("HighLevel,Function+EarlyLoopOpt"); + // FIXME: update EagerSpecializer to be a function pass! P.addEagerSpecializer(); - addSSAPasses(P, OptimizationLevelKind::HighLevel); + addFunctionPasses(P, OptimizationLevelKind::HighLevel); + addHighLevelLoopOptPasses(P); } -static void addMidModulePassesStackPromotePassPipeline(SILPassPipelinePlan &P) { - P.startPipeline("MidModulePasses+StackPromote"); +// After "high-level" function passes have processed the entire call tree, run +// one round of module passes. +// +// It is not clear why stack promotion +static void addHighLevelModulePipeline(SILPassPipelinePlan &P) { + P.startPipeline("HighLevel,Module+StackPromote"); P.addDeadFunctionElimination(); P.addPerformanceSILLinker(); P.addDeadObjectElimination(); P.addGlobalPropertyOpt(); - // Do the first stack promotion on high-level SIL. + // Do the first stack promotion on high-level SIL before serialization. 
P.addStackPromotion(); + + P.addGlobalOpt(); + P.addLetPropertiesOpt(); } -static bool addMidLevelPassPipeline(SILPassPipelinePlan &P) { - P.startPipeline("MidLevel"); - addSSAPasses(P, OptimizationLevelKind::MidLevel); - if (P.getOptions().StopOptimizationAfterSerialization) - return true; +static void addSerializePipeline(SILPassPipelinePlan &P) { + P.startPipeline("Serialize"); + // It is important to serialize before any of the @_semantics + // functions are inlined, because otherwise the information about + // uses of such functions inside the module is lost, + // which reduces the ability of the compiler to optimize clients + // importing this module. + P.addSerializeSILPass(); + + // Now strip any transparent functions that still have ownership. + if (P.getOptions().StripOwnershipAfterSerialization) + P.addOwnershipModelEliminator(); +} + +static void addMidLevelFunctionPipeline(SILPassPipelinePlan &P) { + P.startPipeline("MidLevel,Function", true /*isFunctionPassPipeline*/); + addFunctionPasses(P, OptimizationLevelKind::MidLevel); // Specialize partially applied functions with dead arguments as a preparation // for CapturePropagation. @@ -451,7 +484,6 @@ static bool addMidLevelPassPipeline(SILPassPipelinePlan &P) { // Run loop unrolling after inlining and constant propagation, because loop // trip counts may have became constant. P.addLoopUnroll(); - return false; } static void addClosureSpecializePassPipeline(SILPassPipelinePlan &P) { @@ -502,12 +534,12 @@ static void addClosureSpecializePassPipeline(SILPassPipelinePlan &P) { } static void addLowLevelPassPipeline(SILPassPipelinePlan &P) { - P.startPipeline("LowLevel"); + P.startPipeline("LowLevel,Function", true /*isFunctionPassPipeline*/); // Should be after FunctionSignatureOpts and before the last inliner. P.addReleaseDevirtualizer(); - addSSAPasses(P, OptimizationLevelKind::LowLevel); + addFunctionPasses(P, OptimizationLevelKind::LowLevel); P.addDeadObjectElimination(); P.addObjectOutliner(); @@ -635,13 +667,21 @@ SILPassPipelinePlan::getPerformancePassPipeline(const SILOptions &Options) { addPerfEarlyModulePassPipeline(P); // Then run an iteration of the high-level SSA passes. - addHighLevelEarlyLoopOptPipeline(P); - addMidModulePassesStackPromotePassPipeline(P); + // + // FIXME: When *not* emitting a .swiftmodule, skip the high-level function + // pipeline to save compile time. + addHighLevelFunctionPipeline(P); - // Run an iteration of the mid-level SSA passes. - if (addMidLevelPassPipeline(P)) + addHighLevelModulePipeline(P); + + addSerializePipeline(P); + if (Options.StopOptimizationAfterSerialization) return P; + // After serialization run the function pass pipeline to iteratively lower + // high-level constructs like @_semantics calls. + addMidLevelFunctionPipeline(P); + // Perform optimizations that specialize. addClosureSpecializePassPipeline(P); @@ -681,7 +721,7 @@ SILPassPipelinePlan::getOnonePassPipeline(const SILOptions &Options) { P.startPipeline("Serialization"); P.addSerializeSILPass(); - // And then strip ownership... + // Now strip any transparent functions that still have ownership. 
if (Options.StripOwnershipAfterSerialization) P.addOwnershipModelEliminator(); diff --git a/lib/SILOptimizer/Transforms/Devirtualizer.cpp b/lib/SILOptimizer/Transforms/Devirtualizer.cpp index 27e140b6993be..cad6fdd08f15a 100644 --- a/lib/SILOptimizer/Transforms/Devirtualizer.cpp +++ b/lib/SILOptimizer/Transforms/Devirtualizer.cpp @@ -30,8 +30,10 @@ using namespace swift; namespace { class Devirtualizer : public SILFunctionTransform { + bool Changed = false; + bool ChangedCFG = false; - bool devirtualizeAppliesInFunction(SILFunction &F, + void devirtualizeAppliesInFunction(SILFunction &F, ClassHierarchyAnalysis *CHA); /// The entry point to the transformation. @@ -41,7 +43,12 @@ class Devirtualizer : public SILFunctionTransform { LLVM_DEBUG(llvm::dbgs() << "***** Devirtualizer on function:" << F.getName() << " *****\n"); - if (devirtualizeAppliesInFunction(F, CHA)) + Changed = false; + ChangedCFG = false; + devirtualizeAppliesInFunction(F, CHA); + if (ChangedCFG) + invalidateAnalysis(SILAnalysis::InvalidationKind::Everything); + else if (Changed) invalidateAnalysis(SILAnalysis::InvalidationKind::CallsAndInstructions); } @@ -49,9 +56,9 @@ class Devirtualizer : public SILFunctionTransform { } // end anonymous namespace -bool Devirtualizer::devirtualizeAppliesInFunction(SILFunction &F, +// Return true if any calls changed, and true if the CFG also changed. +void Devirtualizer::devirtualizeAppliesInFunction(SILFunction &F, ClassHierarchyAnalysis *CHA) { - bool Changed = false; llvm::SmallVector NewApplies; OptRemark::Emitter ORE(DEBUG_TYPE, F.getModule()); @@ -69,11 +76,14 @@ bool Devirtualizer::devirtualizeAppliesInFunction(SILFunction &F, } } for (auto Apply : Applies) { - auto NewInst = tryDevirtualizeApply(Apply, CHA, &ORE); + ApplySite NewInst; + bool modifiedCFG; + std::tie(NewInst, modifiedCFG) = tryDevirtualizeApply(Apply, CHA, &ORE); if (!NewInst) continue; Changed = true; + ChangedCFG |= modifiedCFG; deleteDevirtualizedApply(Apply); NewApplies.push_back(NewInst); @@ -105,8 +115,6 @@ bool Devirtualizer::devirtualizeAppliesInFunction(SILFunction &F, if (CalleeFn->isDefinition() && CalleeFn->shouldOptimize()) addFunctionToPassManagerWorklist(CalleeFn, nullptr); } - - return Changed; } SILTransform *swift::createDevirtualizer() { return new Devirtualizer(); } diff --git a/lib/SILOptimizer/Transforms/PerformanceInliner.cpp b/lib/SILOptimizer/Transforms/PerformanceInliner.cpp index f03e5bd1d7f40..9b7b2b62a5a21 100644 --- a/lib/SILOptimizer/Transforms/PerformanceInliner.cpp +++ b/lib/SILOptimizer/Transforms/PerformanceInliner.cpp @@ -67,6 +67,12 @@ class SILPerformanceInliner { llvm::DenseMap SPAs; llvm::SpecificBumpPtrAllocator SPAAllocator; + // Mark semantic functions that have nested semantic calls. This is + // effectively an immutable cache since we do not inline semantic calls into + // other semantic calls. This is computed bottom up--when checking a call + // site, we assume that a callee has already been evaluated. + llvm::SmallPtrSet nestedSemanticFunctions; + ColdBlockInfo CBI; OptRemark::Emitter &ORE; @@ -101,6 +107,9 @@ class SILPerformanceInliner { /// The benefit of a onFastPath builtin. FastPathBuiltinBenefit = RemovedCallBenefit + 40, + /// The benefit of inlining a function with a semantic call site. + SemanticCallBenefit = RemovedCallBenefit + 50, + /// The benefit of being able to devirtualize a call. 
DevirtualizedCallBenefit = RemovedCallBenefit + 300, @@ -348,6 +357,16 @@ bool SILPerformanceInliner::isProfitableToInline( CalleeCost += (int)instructionInlineCost(I); if (FullApplySite FAI = FullApplySite::isa(&I)) { + // Call sites into semantic calls need to be inlined into the parent + // scope for optimization based on those semantics to kick in. This may + // mean the call can be hoisted out of a loop for example. Do this only + // after the top-level scope is fully specialized, otherwise it could + // actually prevent inlining of callers. + SILFunction *Callee = FAI.getReferencedFunctionOrNull(); + if (!IsGeneric && Callee && Callee->hasSemanticsAttrs()) { + BlockW.updateBenefit(Benefit, SemanticCallBenefit); + } + // Check if the callee is passed as an argument. If so, increase the // threshold, because inlining will (probably) eliminate the closure. SILInstruction *def = constTracker.getDefInCaller(FAI.getCallee()); @@ -361,9 +380,9 @@ bool SILPerformanceInliner::isProfitableToInline( if (!def) continue; + // Ignore anything else that is not a generic call or if inlining of + // generics is forbidden. auto Subs = FAI.getSubstitutionMap(); - - // Bail if it is not a generic call or inlining of generics is forbidden. if (!EnableSILInliningOfGenerics || !Subs.hasAnySubstitutableParams()) continue; @@ -494,7 +513,7 @@ bool SILPerformanceInliner::isProfitableToInline( // This is the final inlining decision. if (CalleeCost > Benefit) { - ORE.emit([&]() { + OptRemark::Emitter::emitOrDebug(DEBUG_TYPE, &ORE, [&]() { using namespace OptRemark; return RemarkMissed("NoInlinedCost", *AI.getInstruction()) << "Not profitable to inline function " << NV("Callee", Callee) @@ -514,7 +533,7 @@ bool SILPerformanceInliner::isProfitableToInline( << ", bb=" << Callee->size() << ", c-bb=" << NumCallerBlocks << "} " << Callee->getName() << '\n'); - ORE.emit([&]() { + OptRemark::Emitter::emitOrDebug(DEBUG_TYPE, &ORE, [&]() { using namespace OptRemark; return RemarkPassed("Inlined", *AI.getInstruction()) << NV("Callee", Callee) << " inlined into " @@ -765,7 +784,8 @@ void SILPerformanceInliner::collectAppliesToInline( // At this occasion we record additional weight increases. addWeightCorrection(FAS, WeightCorrections); - if (SILFunction *Callee = getEligibleFunction(FAS, WhatToInline)) { + if (SILFunction *Callee = + getEligibleFunction(FAS, WhatToInline, nestedSemanticFunctions)) { // Compute the shortest-path analysis for the callee. SILLoopInfo *CalleeLI = LA->get(Callee); ShortestPathAnalysis *CalleeSPA = getSPA(Callee, CalleeLI); @@ -792,6 +812,8 @@ void SILPerformanceInliner::collectAppliesToInline( } #endif + bool semanticFunction = isOptimizableSemanticFunction(Caller); + ConstantTracker constTracker(Caller); DominanceOrder domOrder(&Caller->front(), DT, Caller->size()); int NumCallerBlocks = (int)Caller->size(); @@ -814,8 +836,13 @@ void SILPerformanceInliner::collectAppliesToInline( FullApplySite AI = FullApplySite(&*I); - auto *Callee = getEligibleFunction(AI, WhatToInline); + auto *Callee = + getEligibleFunction(AI, WhatToInline, nestedSemanticFunctions); if (Callee) { + // Mark nested semantic functions to guide inlining of callers. + if (semanticFunction && isOptimizableSemanticFunction(Callee)) + nestedSemanticFunctions.insert(Caller); + // Check if we have an always_inline or transparent function. If we do, // just add it to our final Applies list and continue. 
if (isInlineAlwaysCallSite(Callee)) { @@ -893,6 +920,27 @@ bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller) { if (AppliesToInline.empty()) return false; + //!!! + bool trace = false; + if (Caller->hasName("$s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF")) { + //!!!llvm::dbgs() << "inlining into testInlineAppend\n"; + //!!!trace = true; + } + + llvm::SmallPtrSet nestedSemanticCalls; + for (auto fullApply : AppliesToInline) { + //!!! + if (false + && nestedSemanticFunctions.count( + fullApply.getReferencedFunctionOrNull())) { + //!!! + if (trace) + llvm::dbgs() << "Nested: " + << fullApply.getReferencedFunctionOrNull()->getName() << "\n"; + nestedSemanticCalls.insert(fullApply); + } + } + // Second step: do the actual inlining. for (auto AI : AppliesToInline) { SILFunction *Callee = AI.getReferencedFunctionOrNull(); @@ -902,6 +950,16 @@ bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller) { continue; } + // If this function calls any nested semantics, only inline the nested + // semantic calls. + if (!nestedSemanticCalls.empty() && !nestedSemanticCalls.count(AI)) { + //!!! + if (trace) + llvm::dbgs() << "Non Nested: " << Callee->getName() << "\n"; + if (ArraySemanticsCall(AI.getInstruction())) + continue; + } + // If we have a callee that doesn't have ownership, but the caller does have // ownership... do not inline. The two modes are incompatible. Today this // should only happen with transparent functions. @@ -952,7 +1010,8 @@ void SILPerformanceInliner::visitColdBlocks( if (!AI) continue; - auto *Callee = getEligibleFunction(AI, WhatToInline); + auto *Callee = + getEligibleFunction(AI, WhatToInline, nestedSemanticFunctions); if (Callee && decideInColdBlock(AI, Callee)) { AppliesToInline.push_back(AI); } @@ -1012,16 +1071,17 @@ class SILPerformanceInlinerPass : public SILFunctionTransform { } // end anonymous namespace /// Create an inliner pass that does not inline functions that are marked with -/// the @_semantics, @_effects or global_init attributes. +/// the @_semantics, @_effects, availability, or global_init attributes. SILTransform *swift::createEarlyInliner() { - return new SILPerformanceInlinerPass( - InlineSelection::NoSemanticsAndGlobalInit, "Early"); + return new SILPerformanceInlinerPass(InlineSelection::PreModuleSerialization, + "Early"); } -/// Create an inliner pass that does not inline functions that are marked with -/// the global_init attribute or have an "availability" semantics attribute. +// The mid-level inliner preserves the lowest level of semantic calls to avoid +// pessimizing anlayses like EscapeAnlysis and SideEffectAnalysis. SILTransform *swift::createPerfInliner() { - return new SILPerformanceInlinerPass(InlineSelection::NoGlobalInit, "Middle"); + return new SILPerformanceInlinerPass(InlineSelection::RetainSemantics, + "Middle"); } /// Create an inliner pass that inlines all functions that are marked with diff --git a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp index 9fba73ceb7c3b..211ca86a8e405 100644 --- a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp +++ b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp @@ -3027,6 +3027,10 @@ bool SimplifyCFG::run() { LLVM_DEBUG(llvm::dbgs() << "### Run SimplifyCFG on " << Fn.getName() << '\n'); + //!!! + if (Fn.hasName("$s32sil_combine_concrete_existential29testWitnessReturnOptionalSelfAA2PP_pSgyF")) { + llvm::dbgs() << "TEST WITNESS SIMPLIFYCFG\n"; + } // Disable some expensive optimizations if the function is huge. 
isVeryLargeFunction = (Fn.size() > 10000); diff --git a/lib/SILOptimizer/Transforms/SpeculativeDevirtualizer.cpp b/lib/SILOptimizer/Transforms/SpeculativeDevirtualizer.cpp index 2ca01e247fd21..b44f36f67b523 100644 --- a/lib/SILOptimizer/Transforms/SpeculativeDevirtualizer.cpp +++ b/lib/SILOptimizer/Transforms/SpeculativeDevirtualizer.cpp @@ -229,7 +229,8 @@ static FullApplySite speculateMonomorphicTarget(FullApplySite AI, // Devirtualize the apply instruction on the identical path. auto NewInst = - devirtualizeClassMethod(IdenAI, DownCastedClassInstance, CD, nullptr); + devirtualizeClassMethod(IdenAI, DownCastedClassInstance, CD, nullptr) + .first; assert(NewInst && "Expected to be able to devirtualize apply!"); (void)NewInst; @@ -414,7 +415,8 @@ static bool tryToSpeculateTarget(FullApplySite AI, ClassHierarchyAnalysis *CHA, // try to devirtualize it completely. ClassHierarchyAnalysis::ClassList Subs; if (isDefaultCaseKnown(CHA, AI, CD, Subs)) { - auto NewInst = tryDevirtualizeClassMethod(AI, SubTypeValue, CD, &ORE); + auto NewInst = + tryDevirtualizeClassMethod(AI, SubTypeValue, CD, &ORE).first; if (NewInst) deleteDevirtualizedApply(AI); return bool(NewInst); @@ -574,7 +576,8 @@ static bool tryToSpeculateTarget(FullApplySite AI, ClassHierarchyAnalysis *CHA, ORE.emit(RB); return true; } - auto NewInst = tryDevirtualizeClassMethod(AI, SubTypeValue, CD, nullptr); + auto NewInst = + tryDevirtualizeClassMethod(AI, SubTypeValue, CD, nullptr).first; if (NewInst) { ORE.emit(RB); deleteDevirtualizedApply(AI); diff --git a/lib/SILOptimizer/Utils/Devirtualize.cpp b/lib/SILOptimizer/Utils/Devirtualize.cpp index d743b79cf6e78..0d3ca65970497 100644 --- a/lib/SILOptimizer/Utils/Devirtualize.cpp +++ b/lib/SILOptimizer/Utils/Devirtualize.cpp @@ -482,12 +482,12 @@ static ApplyInst *replaceApplyInst(SILBuilder &builder, SILLocation loc, return newAI; } -static TryApplyInst *replaceTryApplyInst(SILBuilder &builder, SILLocation loc, - TryApplyInst *oldTAI, SILValue newFn, - SubstitutionMap newSubs, - ArrayRef newArgs, - SILFunctionConventions conv, - ArrayRef newArgBorrows) { +// Return the new try_apply and true if a cast required CFG modification. +static std::pair +replaceTryApplyInst(SILBuilder &builder, SILLocation loc, TryApplyInst *oldTAI, + SILValue newFn, SubstitutionMap newSubs, + ArrayRef newArgs, SILFunctionConventions conv, + ArrayRef newArgBorrows) { SILBasicBlock *normalBB = oldTAI->getNormalBB(); SILBasicBlock *resultBB = nullptr; @@ -537,7 +537,7 @@ static TryApplyInst *replaceTryApplyInst(SILBuilder &builder, SILLocation loc, } builder.setInsertionPoint(normalBB->begin()); - return newTAI; + return {newTAI, resultCastRequired}; } static BeginApplyInst * @@ -599,17 +599,18 @@ replacePartialApplyInst(SILBuilder &builder, SILLocation loc, return newPAI; } -static ApplySite replaceApplySite(SILBuilder &builder, SILLocation loc, - ApplySite oldAS, SILValue newFn, - SubstitutionMap newSubs, - ArrayRef newArgs, - SILFunctionConventions conv, - ArrayRef newArgBorrows) { +// Return the new apply and true if the CFG was also modified. 
+static std::pair +replaceApplySite(SILBuilder &builder, SILLocation loc, ApplySite oldAS, + SILValue newFn, SubstitutionMap newSubs, + ArrayRef newArgs, SILFunctionConventions conv, + ArrayRef newArgBorrows) { switch (oldAS.getKind()) { case ApplySiteKind::ApplyInst: { auto *oldAI = cast(oldAS); - return replaceApplyInst(builder, loc, oldAI, newFn, newSubs, newArgs, - newArgBorrows); + return {replaceApplyInst(builder, loc, oldAI, newFn, newSubs, newArgs, + newArgBorrows), + false}; } case ApplySiteKind::TryApplyInst: { auto *oldTAI = cast(oldAS); @@ -618,14 +619,16 @@ static ApplySite replaceApplySite(SILBuilder &builder, SILLocation loc, } case ApplySiteKind::BeginApplyInst: { auto *oldBAI = dyn_cast(oldAS); - return replaceBeginApplyInst(builder, loc, oldBAI, newFn, newSubs, newArgs, - newArgBorrows); + return {replaceBeginApplyInst(builder, loc, oldBAI, newFn, newSubs, newArgs, + newArgBorrows), + false}; } case ApplySiteKind::PartialApplyInst: { assert(newArgBorrows.empty()); auto *oldPAI = cast(oldAS); - return replacePartialApplyInst(builder, loc, oldPAI, newFn, newSubs, - newArgs); + return { + replacePartialApplyInst(builder, loc, oldPAI, newFn, newSubs, newArgs), + false}; } } llvm_unreachable("covered switch"); @@ -729,10 +732,12 @@ bool swift::canDevirtualizeClassMethod(FullApplySite applySite, ClassDecl *cd, /// \p ClassOrMetatype is a class value or metatype value that is the /// self argument of the apply we will devirtualize. /// return the result value of the new ApplyInst if created one or null. -FullApplySite swift::devirtualizeClassMethod(FullApplySite applySite, - SILValue classOrMetatype, - ClassDecl *cd, - OptRemark::Emitter *ore) { +/// +/// Return the new apply and true if the CFG was also modified. +std::pair +swift::devirtualizeClassMethod(FullApplySite applySite, + SILValue classOrMetatype, ClassDecl *cd, + OptRemark::Emitter *ore) { LLVM_DEBUG(llvm::dbgs() << " Trying to devirtualize : " << *applySite.getInstruction()); @@ -793,8 +798,10 @@ FullApplySite swift::devirtualizeClassMethod(FullApplySite applySite, newArgs.push_back(arg); ++paramArgIter; } - ApplySite newAS = replaceApplySite(builder, loc, applySite, fri, subs, - newArgs, substConv, newArgBorrows); + ApplySite newAS; + bool modifiedCFG; + std::tie(newAS, modifiedCFG) = replaceApplySite( + builder, loc, applySite, fri, subs, newArgs, substConv, newArgBorrows); FullApplySite newAI = FullApplySite::isa(newAS.getInstruction()); assert(newAI); @@ -808,16 +815,14 @@ FullApplySite swift::devirtualizeClassMethod(FullApplySite applySite, }); NumClassDevirt++; - return newAI; + return {newAI, modifiedCFG}; } -FullApplySite swift::tryDevirtualizeClassMethod(FullApplySite applySite, - SILValue classInstance, - ClassDecl *cd, - OptRemark::Emitter *ore, - bool isEffectivelyFinalMethod) { +std::pair swift::tryDevirtualizeClassMethod( + FullApplySite applySite, SILValue classInstance, ClassDecl *cd, + OptRemark::Emitter *ore, bool isEffectivelyFinalMethod) { if (!canDevirtualizeClassMethod(applySite, cd, ore, isEffectivelyFinalMethod)) - return FullApplySite(); + return {FullApplySite(), false}; return devirtualizeClassMethod(applySite, classInstance, cd, ore); } @@ -960,9 +965,12 @@ swift::getWitnessMethodSubstitutions(SILModule &module, ApplySite applySite, /// Generate a new apply of a function_ref to replace an apply of a /// witness_method when we've determined the actual function we'll end /// up calling. 
-static ApplySite devirtualizeWitnessMethod(ApplySite applySite, SILFunction *f, - ProtocolConformanceRef cRef, - OptRemark::Emitter *ore) { +/// +/// Return the new apply and true if the CFG was also modified. +static std::pair +devirtualizeWitnessMethod(ApplySite applySite, SILFunction *f, + ProtocolConformanceRef cRef, + OptRemark::Emitter *ore) { // We know the witness thunk and the corresponding set of substitutions // required to invoke the protocol method at this point. auto &module = applySite.getModule(); @@ -1017,7 +1025,9 @@ static ApplySite devirtualizeWitnessMethod(ApplySite applySite, SILFunction *f, SILLocation loc = applySite.getLoc(); auto *fri = applyBuilder.createFunctionRefFor(loc, f); - ApplySite newApplySite = + ApplySite newApplySite; + bool modifiedCFG; + std::tie(newApplySite, modifiedCFG) = replaceApplySite(applyBuilder, loc, applySite, fri, subMap, arguments, substConv, borrowedArgs); @@ -1029,7 +1039,7 @@ static ApplySite devirtualizeWitnessMethod(ApplySite applySite, SILFunction *f, << "Devirtualized call to " << NV("Method", f); }); NumWitnessDevirt++; - return newApplySite; + return {newApplySite, modifiedCFG}; } static bool canDevirtualizeWitnessMethod(ApplySite applySite) { @@ -1066,10 +1076,11 @@ static bool canDevirtualizeWitnessMethod(ApplySite applySite) { /// In the cases where we can statically determine the function that /// we'll call to, replace an apply of a witness_method with an apply /// of a function_ref, returning the new apply. -ApplySite swift::tryDevirtualizeWitnessMethod(ApplySite applySite, - OptRemark::Emitter *ore) { +std::pair +swift::tryDevirtualizeWitnessMethod(ApplySite applySite, + OptRemark::Emitter *ore) { if (!canDevirtualizeWitnessMethod(applySite)) - return ApplySite(); + return {ApplySite(), false}; SILFunction *f; SILWitnessTable *wt; @@ -1088,9 +1099,11 @@ ApplySite swift::tryDevirtualizeWitnessMethod(ApplySite applySite, /// Attempt to devirtualize the given apply if possible, and return a /// new instruction in that case, or nullptr otherwise. -ApplySite swift::tryDevirtualizeApply(ApplySite applySite, - ClassHierarchyAnalysis *cha, - OptRemark::Emitter *ore) { +/// +/// Return the new apply and true if the CFG was also modified. +std::pair +swift::tryDevirtualizeApply(ApplySite applySite, ClassHierarchyAnalysis *cha, + OptRemark::Emitter *ore) { LLVM_DEBUG(llvm::dbgs() << " Trying to devirtualize: " << *applySite.getInstruction()); @@ -1105,7 +1118,7 @@ ApplySite swift::tryDevirtualizeApply(ApplySite applySite, // TODO: check if we can also de-virtualize partial applies of class methods. 
FullApplySite fas = FullApplySite::isa(applySite.getInstruction()); if (!fas) - return ApplySite(); + return {ApplySite(), false}; /// Optimize a class_method and alloc_ref pair into a direct function /// reference: @@ -1151,7 +1164,7 @@ ApplySite swift::tryDevirtualizeApply(ApplySite applySite, return tryDevirtualizeClassMethod(fas, instance, cd, ore); } - return ApplySite(); + return {ApplySite(), false}; } bool swift::canDevirtualizeApply(FullApplySite applySite, diff --git a/lib/SILOptimizer/Utils/PerformanceInlinerUtils.cpp b/lib/SILOptimizer/Utils/PerformanceInlinerUtils.cpp index e5dee815a58c8..03a0f80647e20 100644 --- a/lib/SILOptimizer/Utils/PerformanceInlinerUtils.cpp +++ b/lib/SILOptimizer/Utils/PerformanceInlinerUtils.cpp @@ -13,6 +13,12 @@ #include "swift/SILOptimizer/Utils/PerformanceInlinerUtils.h" #include "swift/AST/Module.h" #include "swift/SILOptimizer/Utils/InstOptUtils.h" +#include "llvm/Support/CommandLine.h" + +llvm::cl::opt + SILInlineNeverFuns("sil-inline-never-functions", llvm::cl::init(""), + llvm::cl::desc("Never inline functions whose name " + "includes this string.")); //===----------------------------------------------------------------------===// // ConstantTracker @@ -556,30 +562,23 @@ static bool calleeIsSelfRecursive(SILFunction *Callee) { return false; } -// Returns true if a given apply site should be skipped during the -// early inlining pass. -// -// NOTE: Add here the checks for any specific @_semantics/@_effects -// attributes causing a given callee to be excluded from the inlining -// during the early inlining pass. -static bool shouldSkipApplyDuringEarlyInlining(FullApplySite AI) { - // Add here the checks for any specific @_semantics attributes that need - // to be skipped during the early inlining pass. - ArraySemanticsCall ASC(AI.getInstruction()); - if (ASC && !ASC.canInlineEarly()) - return true; - - SILFunction *Callee = AI.getReferencedFunctionOrNull(); - if (!Callee) - return false; - - if (Callee->hasSemanticsAttr("self_no_escaping_closure") || - Callee->hasSemanticsAttr("pair_no_escaping_closure")) - return true; - - // Add here the checks for any specific @_effects attributes that need - // to be skipped during the early inlining pass. - if (Callee->hasEffectsKind()) +// Returns true if a given function has recognized @_semantics, and should +// have inlining deferred. +bool swift::isOptimizableSemanticFunction(SILFunction *callee) { + + // Currently, we only consider "array" semantic calls to be "semantic + // functions" because we only have semantic passes that recognize array + // operations. In the future, any semantic call that represents a data + // structure "primitive" should return true here. + auto arrayCallKind = getArraySemanticsKind(callee); + if (arrayCallKind != ArrayCallKind::kNone) { + // @_semantics("array.uninitialized_intrinsic") is not treated like other + // array semantic calls because it is a compiler intrinsic that hides the + // "normal" semantic method @_semantics("array.uninitialized")--it should be + // inlined away immediately. + return arrayCallKind != ArrayCallKind::kArrayUninitializedIntrinsic; + } + if (callee->hasSemanticsAttr("inline_late")) return true; return false; @@ -625,51 +624,18 @@ static bool isCallerAndCalleeLayoutConstraintsCompatible(FullApplySite AI) { } // Returns the callee of an apply_inst if it is basically inlinable. 
-SILFunction *swift::getEligibleFunction(FullApplySite AI, - InlineSelection WhatToInline) { +SILFunction *swift::getEligibleFunction( + FullApplySite AI, InlineSelection WhatToInline, + SmallPtrSetImpl &nestedSemanticFunctions) { SILFunction *Callee = AI.getReferencedFunctionOrNull(); - if (!Callee) { + if (!Callee) return nullptr; - } // Not all apply sites can be inlined, even if they're direct. if (!SILInliner::canInlineApplySite(AI)) return nullptr; - ModuleDecl *SwiftModule = Callee->getModule().getSwiftModule(); - bool IsInStdlib = (SwiftModule->isStdlibModule() || - SwiftModule->isOnoneSupportModule()); - - // Don't inline functions that are marked with the @_semantics or @_effects - // attribute if the inliner is asked not to inline them. - if (Callee->hasSemanticsAttrs() || Callee->hasEffectsKind()) { - if (WhatToInline == InlineSelection::NoSemanticsAndGlobalInit) { - if (shouldSkipApplyDuringEarlyInlining(AI)) - return nullptr; - if (Callee->hasSemanticsAttr("inline_late")) - return nullptr; - } - // The "availability" semantics attribute is treated like global-init. - if (Callee->hasSemanticsAttrs() && - WhatToInline != InlineSelection::Everything && - (Callee->hasSemanticsAttrThatStartsWith("availability") || - (Callee->hasSemanticsAttrThatStartsWith("inline_late")))) { - return nullptr; - } - if (Callee->hasSemanticsAttrs() && - WhatToInline == InlineSelection::Everything) { - if (Callee->hasSemanticsAttrThatStartsWith("inline_late") && IsInStdlib) { - return nullptr; - } - } - - } else if (Callee->isGlobalInit()) { - if (WhatToInline != InlineSelection::Everything) { - return nullptr; - } - } - // We can't inline external declarations. if (Callee->empty() || Callee->isExternalDeclaration()) { return nullptr; @@ -680,10 +646,57 @@ SILFunction *swift::getEligibleFunction(FullApplySite AI, return nullptr; } + if (!SILInlineNeverFuns.empty() + && Callee->getName().find(SILInlineNeverFuns, 0) != StringRef::npos) + return nullptr; + if (!Callee->shouldOptimize()) { return nullptr; } + switch (WhatToInline) { + case InlineSelection::PreModuleSerialization: + // Correctness: don't inline availability checks before serialization. + if (Callee->hasSemanticsAttrThatStartsWith("availability") + || Callee->hasSemanticsAttrThatStartsWith("inline_late") + || isOptimizableSemanticFunction(Callee) || Callee->hasEffectsKind() + || Callee->isGlobalInit()) { + return nullptr; + } + break; + case InlineSelection::RetainSemantics: + if (isOptimizableSemanticFunction(Callee)) { + // Avoid inlining the lowest level of semantic call. Doing so will + // pessimize analyses such as EscapeAnlysis and SideEffectAnalysis by + // exposing underlying ADT guts. + if (!nestedSemanticFunctions.count(Callee)) + return nullptr; + + // Avoid inlining a semantic call into a semantic function. It hides the + // underlying semantics from semantic passes. First, the outer semantic + // call must be inlined. Then a full round of semantic passes must rerun + // (all array optimizations). Afterward, the next level of semantic calls + // can be inlined. This relies on no unannotated functions on the call + // stack between the outer and inner semantic functions. + if (!isOptimizableSemanticFunction(AI.getFunction())) + return nullptr; + } + // Avoid inlining global initializers until GlobalOpt runs after all the + // mid-level function passes. 
+ if (Callee->isGlobalInit()) + return nullptr; + + break; + case InlineSelection::Everything: { + ModuleDecl *SwiftModule = Callee->getModule().getSwiftModule(); + bool IsInStdlib = + (SwiftModule->isStdlibModule() || SwiftModule->isOnoneSupportModule()); + if (Callee->hasSemanticsAttrThatStartsWith("inline_late") && IsInStdlib) + return nullptr; + break; + } + } + SILFunction *Caller = AI.getFunction(); // We don't support inlining a function that binds dynamic self because we @@ -721,6 +734,8 @@ SILFunction *swift::getEligibleFunction(FullApplySite AI, // Inlining self-recursive functions into other functions can result // in excessive code duplication since we run the inliner multiple // times in our pipeline + // + // FIXME: This should be cached! if (calleeIsSelfRecursive(Callee)) { return nullptr; } diff --git a/stdlib/public/core/Array.swift b/stdlib/public/core/Array.swift index bec36482144c3..032eec13cb84b 100644 --- a/stdlib/public/core/Array.swift +++ b/stdlib/public/core/Array.swift @@ -1027,6 +1027,7 @@ extension Array: RangeReplaceableCollection { /// the new capacity is calculated using `_growArrayCapacity`, but at least /// kept at `minimumCapacity`. @_alwaysEmitIntoClient + @_semantics("array.mutate_unknown") internal mutating func _reserveCapacityImpl( minimumCapacity: Int, growForAppend: Bool ) { diff --git a/stdlib/public/core/ContiguousArray.swift b/stdlib/public/core/ContiguousArray.swift index 13275086d2f26..bce9439a9397d 100644 --- a/stdlib/public/core/ContiguousArray.swift +++ b/stdlib/public/core/ContiguousArray.swift @@ -664,6 +664,7 @@ extension ContiguousArray: RangeReplaceableCollection { /// If a new buffer needs to be allocated and `growForAppend` is true, /// the new capacity is calculated using `_growArrayCapacity`. @_alwaysEmitIntoClient + @_semantics("array.mutate_unknown") internal mutating func _reserveCapacityImpl( minimumCapacity: Int, growForAppend: Bool ) { diff --git a/test/SILOptimizer/array_contentof_opt.swift b/test/SILOptimizer/array_contentof_opt.swift index aa218c2c6c88f..508c01135c89a 100644 --- a/test/SILOptimizer/array_contentof_opt.swift +++ b/test/SILOptimizer/array_contentof_opt.swift @@ -1,7 +1,16 @@ -// RUN: %target-swift-frontend -O -sil-verify-all -emit-sil %s | %FileCheck %s +// RUN: %target-swift-frontend -O -sil-verify-all -emit-sil -Xllvm '-sil-inline-never-functions=$sSa6append' %s | %FileCheck %s // REQUIRES: swift_stdlib_no_asserts,optimized_stdlib -// This is an end-to-end test of the array(contentsOf) -> array(Element) optimization +// This is an end-to-end test of the Array.append(contentsOf:) -> +// Array.append(Element) optimization. +// +// To check that the optimization produces the expected +// Array.append(Element) calls, the CHECK lines match those call +// sites. The optimizer may subsequently inline Array.append(Element), +// which is good, but to keep the test simple and specific to the +// optimization, the RUN line prevents inlining Array.append(Element). +// Likewise, negative test check for the existence of +// Array.append(contentsOf:), so don't inline those either. 
// CHECK-LABEL: sil @{{.*}}testInt // CHECK-NOT: apply @@ -22,14 +31,13 @@ public func testInt(_ a: inout [Int]) { // CHECK-DAG: apply [[F]] // CHECK-DAG: apply [[F]] // CHECK: } // end sil function '{{.*}}testThreeInts{{.*}}' - public func testThreeInts(_ a: inout [Int]) { a += [1, 2, 3] } // CHECK-LABEL: sil @{{.*}}testTooManyInts // CHECK-NOT: apply -// CHECK: [[F:%[0-9]+]] = function_ref @$sSa6append10contentsOfyqd__n_t7ElementQyd__RszSTRd__lFSi_SaySiGTg5Tf4gn_n +// CHECK: [[F:%[0-9]+]] = function_ref @$sSa6append10contentsOfyqd__n_t7ElementQyd__RszSTRd__lFSi_SaySiGTg5 // CHECK-NOT: apply // CHECK: apply [[F]] // CHECK-NOT: apply diff --git a/test/SILOptimizer/array_semantics_nested.swift b/test/SILOptimizer/array_semantics_nested.swift new file mode 100644 index 0000000000000..67d0891814857 --- /dev/null +++ b/test/SILOptimizer/array_semantics_nested.swift @@ -0,0 +1,160 @@ +// RUN: %target-swift-frontend -O -emit-sil %s -Xllvm -debug-only=sil-inliner -Xllvm -debug-only=array-element-propagation -Xllvm -debug-only=cowarray-opts 2>&1 | %FileCheck %s + +// Test nested array semantic calls. +// +// FIXME: This test case is not yet fully optimized. See +// Implement stable optimization of @_semantic calls +// +// The relevant sequence of passes is: +// +// - Early inlining does *not* inline Array.append(contentsOf:). +// +// - Early inlining inlines testInlineElts -> testInlineAppend. +// +// - ArrayElementPropagation of literal '[1, 2]' replaces +// append(contentsOf:) with two calls to Array.append and removes the +// temporary array literal. +// +// - Performance inlining does *not* initially inline any nested Array semantic +// calls, like "_makeUniqueAndReserveCapacityIfNotUnique". +// +// - Performance inlining inlines Array.append(contentsOf:) and resets +// the function pipeline. +// +// - COWArrayOpts hoists the call to _makeUniqueAndReserveCapacityIfNotUnique". + +// Here is the same sequence with interleaved CHECKs: + +// - Performance inlining does not initially inline any nested Array semantic +// CHECK-NOT: inline [{{.*}}]] $sSa6append10contentsOfyqd__n_t7ElementQyd__RszSTRd__lF +// CHECK-NOT: inline [{{.*}}] $sSa034_makeUniqueAndReserveCapacityIfNotB0yyFSi_Tg5 + +// - Early inlining inlines testInlineElts -> testInlineAppend. +// CHECK-LABEL: Inline into caller: $s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF +// CHECK: inline [{{.*}}] $s22array_semantics_nested14testInlineElts_4eltsySaySiGz_ADtF + +// CHECK-NOT: inline [{{.*}}]] $sSa6append10contentsOfyqd__n_t7ElementQyd__RszSTRd__lF +// CHECK-NOT: inline [{{.*}}] $sSa034_makeUniqueAndReserveCapacityIfNotB0yyFSi_Tg5 + +// - ArrayElementPropagation of literal '[1, 2]' replaces +// append(contentsOf:) with two calls to Array.append and removes the +// temporary array literal. +// CHECK: Array append contentsOf calls replaced in $s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF + +// - Performance inlining does *not* initially inline any nested Array semantic +// calls, like "_makeUniqueAndReserveCapacityIfNotUnique". + +// CHECK-NOT: inline [{{.*}}]] $sSa6append10contentsOfyqd__n_t7ElementQyd__RszSTRd__lF +// CHECK-NOT: inline [{{.*}}] $sSa034_makeUniqueAndReserveCapacityIfNotB0yyFSi_Tg5 + +// - Performance inlining inlines Array.append(Element) and resets +// the function pipeline. 
+ +// CHECK: Inline into caller: $s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF +// CHECK: inline [{{.*}}] $sSa6appendyyxnFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa6appendyyxnFSi_Tg5 + +// CHECK: COW Array Opts in Func $s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF +// CHECK: Array Opts in Loop Loop at depth 1 containing: +// CHECK: Checking mutable array: %{{.*}} = alloc_stack $Array, var, name "result" +// +// FIXME: Uniqueness check hoisting has not worked since Array methods +// were refactored for code size. To fix this, reserveCapacityForAppend needs +// to be implemented in terms of its lower level semantics calls. +// Implement stable optimization of @_semantic calls +// +// CHECK-TODO: Hoisting make_mutable: +// CHECK-TODO: Removing make_mutable call: + +// CHECK-NOT: Inline into caller + +// - The next round of inlinling in the same function pipeline. +// CHECK: Inline into caller: $s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF +// CHECK: inline [{{.*}}] $sSa034_makeUniqueAndReserveCapacityIfNotB0yyFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa9_getCountSiyFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa12_getCapacitySiyFSi_Tg5 +// CHECK-TODO: inline [{{.*}}] $sSa15reserveCapacityyySiFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa9_getCountSiyFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa36_reserveCapacityAssumingUniqueBuffer8oldCountySi_tFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa37_appendElementAssumeUniqueAndCapacity_03newB0ySi_xntFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa9_getCountSiyFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa36_reserveCapacityAssumingUniqueBuffer8oldCountySi_tFSi_Tg5 +// CHECK: inline [{{.*}}] $sSa37_appendElementAssumeUniqueAndCapacity_03newB0ySi_xntFSi_Tg5 + +// This helper ensures that at least one round of inlining is needed +// *before* inlining Array.append. +func testInlineElts(_ a: inout [Int], elts: [Int]) -> () { + a.append(contentsOf: elts) +} + +// CHECK-LABEL: sil @$s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF : $@convention(thin) (Int) -> @owned Array { +// CHECK: bb0(%0 : $Int): +// Initialize the array... +// CHECK: [[RESULTARRAY:%[0-9]+]] = alloc_stack $Array, var, name "result" +// CHECK: store %{{.*}} to [[RESULTARRAY]] : $*Array + +// Perform the uniqueness check... (FIXME: should be able to optimize this away since it hasn't been escaped) +// CHECK: [[BUFADR:%[0-9]+]] = struct_element_addr [[RESULTARRAY]] : $*Array, #Array._buffer +// CHECK: [[STORADR:%[0-9]+]] = struct_element_addr [[BUFADR]] : $*_ArrayBuffer, #_ArrayBuffer._storage +// CHECK: [[BRIDGE:%[0-9]+]] = struct_element_addr [[STORADR]] : $*_BridgeStorage<__ContiguousArrayStorageBase>, #_BridgeStorage.rawValue +// CHECK: [[NATIVE:%[0-9]+]] = unchecked_addr_cast [[BRIDGE]] : $*Builtin.BridgeObject to $*Builtin.NativeObject +// CHECK: [[UNIQ:%[0-9]+]] = is_unique [[NATIVE]] : $*Builtin.NativeObject +// CHECK-TODO: [[EXPECT:%[0-9]+]] = builtin "int_expect_Int1"([[UNIQ]] : $Builtin.Int1 +// CHECK-TODO: cond_br [[EXPECT]] +// CHECK: cond_br [[UNIQ]] + +// Enter the loop... +// CHECK-TODO: [[LOOPBB:bb[0-9]+]](%{{.*}} : $Builtin.Int64): // Preds: [[TAILBB:bb[0-9]+]] bb + +// FIXME: All calls should be removed from the loop +// CHECK-NOT-TODO: apply + +// Reserve capacity... +// +// FIXME: There is still an is_uniq inside the loop because of reserveCapacity. +// Either reserveCapacity should be split into a hoistable uniqueness check, +// or we should be able to prove that the uniqueness check is dominated with no escape. 
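As an aside to the FIXME above: the transformation COWArrayOpts wants to perform here is ordinary copy-on-write uniqueness-check hoisting. The self-contained model below uses only public API; the container and method names are illustrative, whereas Array itself goes through the _makeUniqueAndReserveCapacityIfNotUnique entry point matched by the CHECK lines.

final class IntStorage {
  var elements: [Int] = []
}

struct COWArray {
  private var storage = IntStorage()

  // The check that shows up as is_unique in the SIL being matched here.
  mutating func makeUniqueIfNeeded() {
    if !isKnownUniquelyReferenced(&storage) {
      let copy = IntStorage()
      copy.elements = storage.elements
      storage = copy
    }
  }

  mutating func appendAssumingUnique(_ x: Int) {
    storage.elements.append(x)
  }
}

// Per-iteration check, which is what the SIL currently does:
//   for _ in 0..<n { a.makeUniqueIfNeeded(); a.appendAssumingUnique(1) }
//
// Hoisted check, which is what the FIXME asks for. It is legal because
// nothing in the loop body can make the buffer non-unique again:
//   a.makeUniqueIfNeeded()
//   for _ in 0..<n { a.appendAssumingUnique(1) }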
+// CHECK: [[BRIDGEOBJ:%.*]] = load [[BRIDGE]] : $*Builtin.BridgeObject +// CHECK: struct_element_addr %{{.*}} : $*_SwiftArrayBodyStorage, #_SwiftArrayBodyStorage.count +// CHECK: load %{{.*}} : $*Builtin.Int64 +// CHECK: struct_element_addr %{{.*}} : $*_SwiftArrayBodyStorage, #_SwiftArrayBodyStorage._capacityAndFlags +// CHECK: load %{{.*}} : $*Builtin.Int64 +// CHECK-TODO: cond_br %{{.*}}, bb8, bb9 +// CHECK: cond_br %{{.*}} +// CHECK-NOT: apply +// CHECK: bb +// CHECK: is_unique %{{.*}} : $*Builtin.NativeObject +// CHECK: cond_br + +// CHECK-NOT-TODO: apply + +// CHECK-TODO: [[SZF:%[0-9]+]] = function_ref @_swift_stdlib_malloc_size : $@convention(c) (UnsafeRawPointer) -> Int +// CHECK-TODO: apply [[SZF]](%{{.*}}) : $@convention(c) (UnsafeRawPointer) -> Int + +// CHECK-NOT-TODO: apply + +// CHECK-TODO: builtin "copyArray" +// CHECK-TODO: store %{{.*}} to [[RESULTARRAY]] : $*Array + +// CHECK-NOT-TODO: apply + +// CHECK-TODO: [[CPF1:%.*]] = function_ref @$sSa16_copyToNewBuffer8oldCountySi_tFSi_Tg5 : $@convention(method) (Int, @inout Array) -> () +// CHECK-TODO: apply [[CPF1]](%{{.*}}, [[RESULTARRAY]]) : $@convention(method) (Int, @inout Array) -> () + +// CHECK-NOT-TODO: apply + +// CHECK-TODO: [[CPF2:%.*]] = function_ref @$sSa16_copyToNewBuffer8oldCountySi_tFSi_Tg5 : $@convention(method) (Int, @inout Array) -> () +// CHECK-TODO: apply [[CPF2]](%{{.*}}, [[RESULTARRAY]]) : $@convention(method) (Int, @inout Array) -> () + +// CHECK-NOT-TODO: apply + +// CHECK-TODO: br [[LOOPBB]](%{{.*}} : $Builtin.Int64) +// CHECK-LABEL: } // end sil function '$s22array_semantics_nested16testInlineAppend5countSaySiGSi_tF' + +public func testInlineAppend(count: Int) -> [Int] { + var result = Array() + for _ in 0.. is inlined into foo4. +// CHECK-LABEL: sil shared [noinline] @$s044generic_specialization_loops_detection_with_C04foo4yyx_q_tr0_lFSi_SdTg5 : $@convention(thin) (Int, Double) -> () { +// CHECK: // Generic specialization information for call-site $s044generic_specialization_loops_detection_with_C04foo4yyx_q_tr0_lFSaySays5UInt8VGG_SaySaySiGGTg5: // CHECK-NEXT: // Caller: $s044generic_specialization_loops_detection_with_C04foo4yyx_q_tr0_lFSi_SdTg5 // CHECK-NEXT: // Parent: $s044generic_specialization_loops_detection_with_C04bar4yyx_q_tr0_lF // CHECK-NEXT: // Substitutions: , Array> @@ -31,6 +34,7 @@ // CHECK-NEXT: // Substitutions: // CHECK-NEXT: // // CHECK-NEXT: apply %{{.*}}Array> +// CHECK-LABEL: } // end sil function '$s044generic_specialization_loops_detection_with_C04foo4yyx_q_tr0_lFSi_SdTg5' // Check specializations of mutually recursive functions which // may result in an infinite specialization loop. @@ -52,6 +56,11 @@ public func testFooBar3() { // Check specializations of mutually recursive functions which // may result in an infinite specialization loop. public var g = 0 + +// Don't inline foo4 just so we can reliably check for specialization +// information both at the function and call-site level. +// bar4 is still inlined so we can test for inlined specialization info. +@inline(never) func foo4(_ t: T, _ s: S) { // Here we have multiple call-sites of the same generic // functions inside the same caller. 
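One note on the specialization-loop test touched above, before the merge_exclusivity changes below: the pattern it guards against is mutual generic recursion in which every round of specialization grows the substituted type. A minimal sketch of that shape follows; the names and bodies are illustrative and are not the test's foo4/bar4. The only point is that each call wraps its arguments in another Array, so naive specialization would never terminate.

@inline(never) // keep the caller stable so remarks and CHECK lines attach to it
func pingA<T, U>(_ t: T, _ u: U) {
  pingB([t], [u])   // asks for a specialization over Array<T>, Array<U>
}

func pingB<T, U>(_ t: T, _ u: U) {
  pingA([t], [u])   // and back again, with one more Array layer
}

// pingA(1, 2.0) would demand specializations for (Int, Double), then
// (Array<Int>, Array<Double>), then (Array<Array<Int>>, ...), and so on;
// the specializer's loop detection is what cuts this chain off. (At run
// time the pair recurses forever; it exists only to show the type growth.)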
diff --git a/test/SILOptimizer/merge_exclusivity.swift b/test/SILOptimizer/merge_exclusivity.swift index 986c69d1d7525..409c4e71e6445 100644 --- a/test/SILOptimizer/merge_exclusivity.swift +++ b/test/SILOptimizer/merge_exclusivity.swift @@ -304,18 +304,22 @@ public final class StreamClass { self.buffer = [] } + @inline(__always) public func write(_ byte: UInt8) { buffer.append(byte) } + @inline(__always) public func write(_ value: WriteProt) { value.writeTo(self) } + @inline(__always) public func writeEscaped(_ string: String) { writeEscaped(string: string.utf8) } + @inline(__always) public func writeEscaped( string sequence: T ) where T.Iterator.Element == UInt8 { @@ -326,12 +330,14 @@ public final class StreamClass { } } +@inline(__always) public func toStream(_ stream: StreamClass, _ value: WriteProt) -> StreamClass { stream.write(value) return stream } extension UInt8: WriteProt { + @inline(__always) public func writeTo(_ stream: StreamClass) { stream.write(self) } @@ -344,6 +350,7 @@ public func asWriteProt(_ string: String) -> WriteProt { private struct EscapedString: WriteProt { let value: String + @inline(__always) func writeTo(_ stream: StreamClass) { _ = toStream(stream, UInt8(ascii: "a")) stream.writeEscaped(value) @@ -359,6 +366,7 @@ private struct EscapedTransforme: WriteProt { let items: [T] let transform: (T) -> String + @inline(__always) func writeTo(_ stream: StreamClass) { for (i, item) in items.enumerated() { if i != 0 { _ = toStream(stream, asWriteProt(transform(item))) } @@ -388,6 +396,6 @@ public func run_MergeTest9(_ N: Int) { let listOfThings: [Thing] = listOfStrings.map(Thing.init) for _ in 1...N { let stream = StreamClass() - _ = toStream(stream, asWriteProt(listOfThings, transform: { $0.value })) + _ = toStream(stream, asWriteProt(listOfThings, transform: { $0.value })) } } diff --git a/utils/swift-autocomplete.bash b/utils/swift-autocomplete.bash index 7e48fd7d5fa85..d79094d0c64c4 100644 --- a/utils/swift-autocomplete.bash +++ b/utils/swift-autocomplete.bash @@ -68,6 +68,7 @@ _swift_complete() -sil-verify-without-invalidation \ -sil-inline-test-threshold \ -sil-inline-test \ + -sil-inline-never-functions \ -sroa-args-remove-dead-args-after \ -ml \ -sil-print-escapes \