diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
index 36af6c98d18b..c1773258554c 100644
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -134,7 +134,8 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB,
             : nullptr),
       GetClangTidyOptions(Opts.GetClangTidyOptions),
       SuggestMissingIncludes(Opts.SuggestMissingIncludes),
-      TweakFilter(Opts.TweakFilter), WorkspaceRoot(Opts.WorkspaceRoot),
+      BuildRecoveryAST(Opts.BuildRecoveryAST), TweakFilter(Opts.TweakFilter),
+      WorkspaceRoot(Opts.WorkspaceRoot),
       // Pass a callback into `WorkScheduler` to extract symbols from a newly
       // parsed file and rebuild the file index synchronously each time an AST
       // is parsed.
@@ -191,6 +192,7 @@ void ClangdServer::addDocument(PathRef File, llvm::StringRef Contents,
   Inputs.ForceRebuild = ForceRebuild;
   Inputs.Opts = std::move(Opts);
   Inputs.Index = Index;
+  Inputs.Opts.BuildRecoveryAST = BuildRecoveryAST;
   bool NewFile = WorkScheduler.update(File, Inputs, WantDiags);
   // If we loaded Foo.h, we want to make sure Foo.cpp is indexed.
   if (NewFile && BackgroundIdx)
@@ -269,9 +271,13 @@ void ClangdServer::signatureHelp(PathRef File, Position Pos,
     if (!IP)
       return CB(IP.takeError());

-    auto PreambleData = IP->Preamble;
-    CB(clangd::signatureHelp(File, IP->Command, PreambleData, IP->Contents, Pos,
-                             FS, Index));
+    const auto *PreambleData = IP->Preamble;
+    if (!PreambleData)
+      return CB(llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                        "Failed to parse includes"));
+
+    CB(clangd::signatureHelp(File, IP->Command, *PreambleData, IP->Contents,
+                             Pos, FS, Index));
   };

   // Unlike code completion, we wait for an up-to-date preamble here.
diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
index a0659c7c3d22..f1e981e6c14f 100644
--- a/clang-tools-extra/clangd/ClangdServer.h
+++ b/clang-tools-extra/clangd/ClangdServer.h
@@ -118,6 +118,9 @@ class ClangdServer {
     /// enabled.
     ClangTidyOptionsBuilder GetClangTidyOptions;

+    /// If true, turn on the `-frecovery-ast` clang flag.
+    bool BuildRecoveryAST = false;
+
     /// Clangd's workspace root. Relevant for "workspace" operations not bound
     /// to a particular file.
     /// FIXME: If not set, should use the current working directory.
@@ -345,6 +348,9 @@ class ClangdServer {
   // can be caused by missing includes (e.g. member access in incomplete type).
   bool SuggestMissingIncludes = false;

+  // If true, preserve expressions in AST for broken code.
+  bool BuildRecoveryAST = false;
+
   std::function<bool(const Tweak &)> TweakFilter;

   // GUARDED_BY(CachedCompletionFuzzyFindRequestMutex)
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 344b90ecaa32..b544510ecea1 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1022,7 +1022,7 @@ class SignatureHelpCollector final : public CodeCompleteConsumer {
 struct SemaCompleteInput {
   PathRef FileName;
   const tooling::CompileCommand &Command;
-  const PreambleData *Preamble;
+  const PreambleData &Preamble;
   llvm::StringRef Contents;
   size_t Offset;
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS;
@@ -1054,8 +1054,8 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
                       IncludeStructure *Includes = nullptr) {
   trace::Span Tracer("Sema completion");
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = Input.VFS;
-  if (Input.Preamble && Input.Preamble->StatCache)
-    VFS = Input.Preamble->StatCache->getConsumingFS(std::move(VFS));
+  if (Input.Preamble.StatCache)
+    VFS = Input.Preamble.StatCache->getConsumingFS(std::move(VFS));
   ParseInputs ParseInput;
   ParseInput.CompileCommand = Input.Command;
   ParseInput.FS = VFS;
@@ -1072,6 +1072,10 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   FrontendOpts.SkipFunctionBodies = true;
   // Disable typo correction in Sema.
   CI->getLangOpts()->SpellChecking = false;
+  // Code completion won't trigger in delayed template bodies.
+  // This is on by default on Windows to allow parsing SDK headers; we're only
+  // disabling it for the main file (not preamble).
+  CI->getLangOpts()->DelayedTemplateParsing = false;
   // Setup code completion.
   FrontendOpts.CodeCompleteOpts = Options;
   FrontendOpts.CodeCompletionAt.FileName = std::string(Input.FileName);
@@ -1095,9 +1099,7 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   // NOTE: we must call BeginSourceFile after prepareCompilerInstance. Otherwise
   // the remapped buffers do not get freed.
   auto Clang = prepareCompilerInstance(
-      std::move(CI),
-      (Input.Preamble && !CompletingInPreamble) ? &Input.Preamble->Preamble
-                                                : nullptr,
+      std::move(CI), !CompletingInPreamble ? &Input.Preamble.Preamble : nullptr,
       std::move(ContentsBuffer), std::move(VFS), IgnoreDiags);
   Clang->getPreprocessorOpts().SingleFileParseMode = CompletingInPreamble;
   Clang->setCodeCompletionConsumer(Consumer.release());
@@ -1114,8 +1116,7 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   //  - but Sema code complete won't see them: as part of the preamble, they're
   //    deserialized only when mentioned.
   // Force them to be deserialized so SemaCodeComplete sees them.
-  if (Input.Preamble)
-    loadMainFilePreambleMacros(Clang->getPreprocessor(), *Input.Preamble);
+  loadMainFilePreambleMacros(Clang->getPreprocessor(), Input.Preamble);
   if (Includes)
     Clang->getPreprocessor().addPPCallbacks(
         collectIncludeStructureCallback(Clang->getSourceManager(), Includes));
@@ -1754,12 +1755,12 @@ codeComplete(PathRef FileName, const tooling::CompileCommand &Command,
   return (!Preamble || Opts.RunParser == CodeCompleteOptions::NeverParse) ?
             std::move(Flow).runWithoutSema(Contents, *Offset, VFS)
           : std::move(Flow).run(
-                {FileName, Command, Preamble, Contents, *Offset, VFS});
+                {FileName, Command, *Preamble, Contents, *Offset, VFS});
 }

 SignatureHelp signatureHelp(PathRef FileName,
                             const tooling::CompileCommand &Command,
-                            const PreambleData *Preamble,
+                            const PreambleData &Preamble,
                             llvm::StringRef Contents, Position Pos,
                             llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
                             const SymbolIndex *Index) {
diff --git a/clang-tools-extra/clangd/CodeComplete.h b/clang-tools-extra/clangd/CodeComplete.h
index df06c156049f..3adea47c89a1 100644
--- a/clang-tools-extra/clangd/CodeComplete.h
+++ b/clang-tools-extra/clangd/CodeComplete.h
@@ -276,7 +276,7 @@ CodeCompleteResult codeComplete(PathRef FileName,
 /// Get signature help at a specified \p Pos in \p FileName.
 SignatureHelp signatureHelp(PathRef FileName,
                             const tooling::CompileCommand &Command,
-                            const PreambleData *Preamble, StringRef Contents,
+                            const PreambleData &Preamble, StringRef Contents,
                             Position Pos,
                             IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
                             const SymbolIndex *Index);
diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h
index ef5386bb0d17..b7cc174455f3 100644
--- a/clang-tools-extra/clangd/Compiler.h
+++ b/clang-tools-extra/clangd/Compiler.h
@@ -38,6 +38,7 @@ class IgnoreDiagnostics : public DiagnosticConsumer {
 struct ParseOptions {
   tidy::ClangTidyOptions ClangTidyOpts;
   bool SuggestMissingIncludes = false;
+  bool BuildRecoveryAST = false;
 };

 /// Information required to run clang, e.g. to parse AST or do code completion.
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 1d6997f0b4d4..2c7cb5d2b85d 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -253,6 +253,10 @@ ParsedAST::build(llvm::StringRef Version,
   const PrecompiledPreamble *PreamblePCH =
       Preamble ? &Preamble->Preamble : nullptr;

+  // Recovery expression currently only works for C++.
+  if (CI->getLangOpts()->CPlusPlus)
+    CI->getLangOpts()->RecoveryAST = Opts.BuildRecoveryAST;
+
   StoreDiags ASTDiags;
   std::string Content = std::string(Buffer->getBuffer());
   std::string Filename =
diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index fdee71fd2244..48f15420032f 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -132,6 +132,10 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI,
   // to read back. We rely on dynamic index for the comments instead.
   CI.getPreprocessorOpts().WriteCommentListToPCH = false;

+  // Recovery expression currently only works for C++.
+  if (CI.getLangOpts()->CPlusPlus)
+    CI.getLangOpts()->RecoveryAST = Inputs.Opts.BuildRecoveryAST;
+
   CppFilePreambleCallbacks SerializedDeclsCollector(FileName, PreambleCallback);
   if (Inputs.FS->setCurrentWorkingDirectory(Inputs.CompileCommand.Directory)) {
     log("Couldn't set working directory when building the preamble.");
diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp
index 77b2cbce40d9..59af922d4005 100644
--- a/clang-tools-extra/clangd/SemanticHighlighting.cpp
+++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp
@@ -520,6 +520,7 @@ llvm::StringRef toSemanticTokenType(HighlightingKind Kind) {
   case HighlightingKind::InactiveCode:
     return "comment";
   }
+  llvm_unreachable("unhandled HighlightingKind");
 }

 std::vector<SemanticToken>
diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/launch.json b/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/launch.json
index cd6b87bd05c0..7d414bc00f32 100644
--- a/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/launch.json
+++ b/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/launch.json
@@ -1,4 +1,4 @@
-// A launch configuration that compiles the extension and then opens it inside a new window
+// A launch configuration that compiles the extension and opens it inside a new window.
 {
     "version": "0.1.0",
     "configurations": [
diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/tasks.json b/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/tasks.json
index fb7f662e14d1..65b1c9598c0e 100644
--- a/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/tasks.json
+++ b/clang-tools-extra/clangd/clients/clangd-vscode/.vscode/tasks.json
@@ -6,25 +6,27 @@
 // ${fileExtname}: the current opened file's extension
 // ${cwd}: the current working directory of the spawned process

-// A task runner that calls a custom npm script that compiles the extension.
+// A task runner that calls a custom npm script to compile the extension.
 {
-    "version": "0.1.0",
+    "version": "2.0.0",

-    // we want to run npm
+    // Run npm.
     "command": "npm",

-    // the command is a shell script
-    "isShellCommand": true,
+    // This command is a shell script.
+    "type": "shell",

     // show the output window only if unrecognized errors occur.
-    "showOutput": "silent",
+    "presentation": {
+        "reveal": "silent",
+    },

-    // we run the custom script "compile" as defined in package.json
+    // Run the custom "compile" script as defined in package.json.
     "args": ["run", "compile", "--loglevel", "silent"],

-    // The tsc compiler is started in watching mode
-    "isWatching": true,
+    // The tsc compiler is kept alive and runs in the background.
+    "isBackground": true,

-    // use the standard tsc in watch mode problem matcher to find compile problems in the output.
+    // Find compilation problems in the output through tsc in watch mode.
     "problemMatcher": "$tsc-watch"
-}
\ No newline at end of file
+}
diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/DEVELOPING.md b/clang-tools-extra/clangd/clients/clangd-vscode/DEVELOPING.md
index e888aba3ea20..15f2b930329e 100644
--- a/clang-tools-extra/clangd/clients/clangd-vscode/DEVELOPING.md
+++ b/clang-tools-extra/clangd/clients/clangd-vscode/DEVELOPING.md
@@ -10,20 +10,20 @@ A guide of developing `vscode-clangd` extension.
 ## Steps

 1. Make sure you disable the installed `vscode-clangd` extension in VS Code.
-2. Make sure you have clangd in /usr/bin/clangd or edit src/extension.ts to
+2. Make sure you have clangd in `/usr/bin/clangd` or edit `src/extension.ts` to
    point to the binary.
-3. In order to start a development instance of VS code extended with this, run:
+3. To start a development instance of VS code extended with this, run:

 ```bash
    $ cd /path/to/clang-tools-extra/clangd/clients/clangd-vscode/
    $ npm install
    $ code .
-   # When VS Code starts, press <F5>.
+   # When VSCode starts, press <F5>.
 ```

 # Contributing

-Please follow the exsiting code style when contributing to the extension, we
+Please follow the existing code style when contributing to the extension, we
 recommend to run `npm run format` before sending a patch.

 # Publish to VS Code Marketplace
@@ -38,15 +38,15 @@ to the marketplace.
 * Bump the version in `package.json`, and commit the change to upstream

 The extension is published under `llvm-vs-code-extensions` account, which is
-currently maintained by clangd developers. If you want to make a new release,
-please contact clangd-dev@lists.llvm.org.
+maintained by clangd developers. If you want to make a new release, please
+contact clangd-dev@lists.llvm.org.

 ## Steps

 ```bash
 $ cd /path/to/clang-tools-extra/clangd/clients/clangd-vscode/
-  # For the first time, you need to login in the account. vsce will ask you for
-  the Personal Access Token, and remember it for future commands.
+  # For the first time, you need to log in to the account. vsce will ask you
+  for the Personal Access Token and will remember it for future commands.
 $ vsce login llvm-vs-code-extensions
   # Publish the extension to the VSCode marketplace.
 $ npm run publish
diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts b/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts
index 4749cd1bb582..a7570b63e552 100644
--- a/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts
+++ b/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts
@@ -3,7 +3,7 @@ import * as vscodelc from 'vscode-languageclient';
 import * as semanticHighlighting from './semantic-highlighting';

 /**
- * Method to get workspace configuration option
+ * Get an option from workspace configuration.
  * @param option name of the option (e.g. for clangd.path should be path)
  * @param defaultValue default value to return if option is not set
  */
@@ -75,8 +75,8 @@ class EnableEditsNearCursorFeature implements vscodelc.StaticFeature {
 }

 /**
- * this method is called when your extension is activate
- * your extension is activated the very first time the command is executed
+ * This method is called when the extension is activated. The extension is
+ * activated the very first time a command is executed.
  */
 export function activate(context: vscode.ExtensionContext) {
   const syncFileEvents = getConfig<boolean>('syncFileEvents', true);
@@ -97,7 +97,7 @@ export function activate(context: vscode.ExtensionContext) {
     documentSelector: [
       { scheme: 'file', language: 'c' },
       { scheme: 'file', language: 'cpp' },
-      // cuda is not supported by vscode, but our extension does.
+      // CUDA is not supported by vscode, but our extension does support it.
       { scheme: 'file', language: 'cuda' },
       { scheme: 'file', language: 'objective-c'},
       { scheme: 'file', language: 'objective-cpp'}
@@ -106,7 +106,7 @@ export function activate(context: vscode.ExtensionContext) {
       // FIXME: send sync file events when clangd provides implementations.
     },
     initializationOptions: { clangdFileStatus: true },
-    // Do not switch to output window when clangd returns output
+    // Do not switch to output window when clangd returns output.
     revealOutputChannelOn: vscodelc.RevealOutputChannelOn.Never,

     // We hack up the completion items a bit to prevent VSCode from re-ranking them
@@ -126,7 +126,7 @@ export function activate(context: vscode.ExtensionContext) {
       provideCompletionItem: async (document, position, context, token, next) => {
         let list = await next(document, position, context, token);
         let items = (Array.isArray(list) ? list : list.items).map(item => {
-          // Gets the prefix used by vscode when doing fuzzymatch.
+          // Gets the prefix used by VSCode when doing fuzzymatch.
           let prefix = document.getText(new vscode.Range(item.range.start, position))
           if (prefix)
             item.filterText = prefix + "_" + item.filterText;
diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/tsconfig.json b/clang-tools-extra/clangd/clients/clangd-vscode/tsconfig.json
index 0b05f3090920..71a62c71da02 100644
--- a/clang-tools-extra/clangd/clients/clangd-vscode/tsconfig.json
+++ b/clang-tools-extra/clangd/clients/clangd-vscode/tsconfig.json
@@ -26,4 +26,4 @@
         "node_modules",
         ".vscode-test"
     ]
-}
\ No newline at end of file
+}
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 7a7bb9b0718e..9bfc58b55f71 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -281,6 +281,15 @@ opt<bool> CrossFileRename{
     Hidden,
 };

+opt<bool> RecoveryAST{
+    "recovery-ast",
+    cat(Features),
+    desc("Preserve expressions in AST for broken code (C++ only). Note that "
+         "this feature is experimental and may lead to crashes"),
+    init(false),
+    Hidden,
+};
+
 opt<unsigned> WorkerThreadsCount{
     "j",
     cat(Misc),
@@ -629,6 +638,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var
   }
   Opts.StaticIndex = StaticIdx.get();
   Opts.AsyncThreadsCount = WorkerThreadsCount;
+  Opts.BuildRecoveryAST = RecoveryAST;

   clangd::CodeCompleteOptions CCOpts;
   CCOpts.IncludeIneligibleResults = IncludeIneligibleResults;
diff --git a/clang-tools-extra/clangd/unittests/ClangdTests.cpp b/clang-tools-extra/clangd/unittests/ClangdTests.cpp
index 1e5fcf3d97e1..d15eba80ae29 100644
--- a/clang-tools-extra/clangd/unittests/ClangdTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ClangdTests.cpp
@@ -552,15 +552,13 @@ TEST_F(ClangdVFSTest, InvalidCompileCommand) {
   EXPECT_ERROR(runFindDocumentHighlights(Server, FooCpp, Position()));
   EXPECT_ERROR(runRename(Server, FooCpp, Position(), "new_name",
                          clangd::RenameOptions()));
+  EXPECT_ERROR(runSignatureHelp(Server, FooCpp, Position()));
   // Identifier-based fallback completion.
   EXPECT_THAT(cantFail(runCodeComplete(Server, FooCpp, Position(),
                                        clangd::CodeCompleteOptions()))
                   .Completions,
               ElementsAre(Field(&CodeCompletion::Name, "int"),
                           Field(&CodeCompletion::Name, "main")));
-  auto SigHelp = runSignatureHelp(Server, FooCpp, Position());
-  ASSERT_TRUE(bool(SigHelp)) << "signatureHelp returned an error";
-  EXPECT_THAT(SigHelp->signatures, IsEmpty());
 }

 class ClangdThreadingTest : public ClangdVFSTest {};
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index 24197485f68a..1084b1550579 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -93,8 +93,9 @@ std::unique_ptr<SymbolIndex> memIndex(std::vector<Symbol> Symbols) {
   return MemIndex::build(std::move(Slab).build(), RefSlab(), RelationSlab());
 }

-CodeCompleteResult completions(ClangdServer &Server, llvm::StringRef TestCode,
-                               Position Point,
+// Runs code completion.
+// If IndexSymbols is non-empty, an index will be built and passed to opts.
+CodeCompleteResult completions(const TestTU &TU, Position Point,
                                std::vector<Symbol> IndexSymbols = {},
                                clangd::CodeCompleteOptions Opts = {}) {
   std::unique_ptr<SymbolIndex> OverrideIndex;
@@ -104,49 +105,34 @@ CodeCompleteResult completions(ClangdServer &Server, llvm::StringRef TestCode,
                                Position Point,
     OverrideIndex = memIndex(std::move(IndexSymbols));
     Opts.Index = OverrideIndex.get();
   }

-  auto File = testPath("foo.cpp");
-  runAddDocument(Server, File, TestCode);
-  auto CompletionList =
-      llvm::cantFail(runCodeComplete(Server, File, Point, Opts));
-  return CompletionList;
-}
-
-CodeCompleteResult completions(ClangdServer &Server, llvm::StringRef Text,
-                               std::vector<Symbol> IndexSymbols = {},
-                               clangd::CodeCompleteOptions Opts = {},
-                               PathRef FilePath = "foo.cpp") {
-  std::unique_ptr<SymbolIndex> OverrideIndex;
-  if (!IndexSymbols.empty()) {
-    assert(!Opts.Index && "both Index and IndexSymbols given!");
-    OverrideIndex = memIndex(std::move(IndexSymbols));
-    Opts.Index = OverrideIndex.get();
+  auto Inputs = TU.inputs();
+  IgnoreDiagnostics Diags;
+  auto CI = buildCompilerInvocation(Inputs, Diags);
+  if (!CI) {
+    ADD_FAILURE() << "Couldn't build CompilerInvocation";
+    return {};
   }
-
-  auto File = testPath(FilePath);
-  Annotations Test(Text);
-  runAddDocument(Server, File, Test.code());
-  auto CompletionList =
-      llvm::cantFail(runCodeComplete(Server, File, Test.point(), Opts));
-  return CompletionList;
+  auto Preamble =
+      buildPreamble(testPath(TU.Filename), *CI, /*OldPreamble=*/nullptr, Inputs,
+                    /*InMemory=*/true, /*Callback=*/nullptr);
+  return codeComplete(testPath(TU.Filename), Inputs.CompileCommand,
+                      Preamble.get(), TU.Code, Point, Inputs.FS, Opts);
 }

-// Builds a server and runs code completion.
-// If IndexSymbols is non-empty, an index will be built and passed to opts.
+// Runs code completion.
 CodeCompleteResult completions(llvm::StringRef Text,
                                std::vector<Symbol> IndexSymbols = {},
                                clangd::CodeCompleteOptions Opts = {},
                                PathRef FilePath = "foo.cpp") {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
+  Annotations Test(Text);
+  auto TU = TestTU::withCode(Test.code());
   // To make sure our tests for completiopns inside templates work on Windows.
-  CDB.ExtraClangFlags = {"-fno-delayed-template-parsing"};
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-  return completions(Server, Text, std::move(IndexSymbols), std::move(Opts),
-                     FilePath);
+  TU.Filename = FilePath.str();
+  return completions(TU, Test.point(), std::move(IndexSymbols),
+                     std::move(Opts));
 }

-// Builds a server and runs code completion.
-// If IndexSymbols is non-empty, an index will be built and passed to opts.
+// Runs code completion without the clang parser.
 CodeCompleteResult completionsNoCompile(llvm::StringRef Text,
                                         std::vector<Symbol> IndexSymbols = {},
                                         clangd::CodeCompleteOptions Opts = {},
@@ -669,53 +655,38 @@ TEST(CompletionTest, SemaIndexMergeWithLimit) {
 }

 TEST(CompletionTest, IncludeInsertionPreprocessorIntegrationTests) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  std::string Subdir = testPath("sub");
-  std::string SearchDirArg = (Twine("-I") + Subdir).str();
-  CDB.ExtraClangFlags = {SearchDirArg.c_str()};
-  std::string BarHeader = testPath("sub/bar.h");
-  FS.Files[BarHeader] = "";
+  TestTU TU;
+  TU.ExtraArgs.push_back("-I" + testPath("sub"));
+  TU.AdditionalFiles["sub/bar.h"] = "";
+  auto BarURI = URI::create(testPath("sub/bar.h")).toString();

-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-  auto BarURI = URI::create(BarHeader).toString();
   Symbol Sym = cls("ns::X");
   Sym.CanonicalDeclaration.FileURI = BarURI.c_str();
   Sym.IncludeHeaders.emplace_back(BarURI, 1);
   // Shoten include path based on search directory and insert.
-  auto Results = completions(Server,
-                             R"cpp(
-          int main() { ns::^ }
-      )cpp",
-                             {Sym});
+  Annotations Test("int main() { ns::^ }");
+  TU.Code = Test.code().str();
+  auto Results = completions(TU, Test.point(), {Sym});
   EXPECT_THAT(Results.Completions,
               ElementsAre(AllOf(Named("X"), InsertInclude("\"bar.h\""))));
   // Can be disabled via option.
   CodeCompleteOptions NoInsertion;
   NoInsertion.InsertIncludes = CodeCompleteOptions::NeverInsert;
-  Results = completions(Server,
-                        R"cpp(
-          int main() { ns::^ }
-      )cpp",
-                        {Sym}, NoInsertion);
+  Results = completions(TU, Test.point(), {Sym}, NoInsertion);
   EXPECT_THAT(Results.Completions,
               ElementsAre(AllOf(Named("X"), Not(InsertInclude()))));
   // Duplicate based on inclusions in preamble.
-  Results = completions(Server,
-                        R"cpp(
+  Test = Annotations(R"cpp(
           #include "sub/bar.h"  // not shortest, so should only match resolved.
           int main() { ns::^ }
-      )cpp",
-                        {Sym});
+      )cpp");
+  TU.Code = Test.code().str();
+  Results = completions(TU, Test.point(), {Sym});
   EXPECT_THAT(Results.Completions, ElementsAre(AllOf(Named("X"), Labeled("X"),
                                                      Not(InsertInclude()))));
 }

 TEST(CompletionTest, NoIncludeInsertionWhenDeclFoundInFile) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
   Symbol SymX = cls("ns::X");
   Symbol SymY = cls("ns::Y");
   std::string BarHeader = testPath("bar.h");
@@ -725,8 +696,7 @@ TEST(CompletionTest, NoIncludeInsertionWhenDeclFoundInFile) {
   SymX.IncludeHeaders.emplace_back("<bar>", 1);
   SymY.IncludeHeaders.emplace_back("<bar>", 1);
   // Shoten include path based on search directory and insert.
-  auto Results = completions(Server,
-                             R"cpp(
+  auto Results = completions(R"cpp(
       namespace ns {
         class X;
         class Y {};
@@ -740,34 +710,27 @@ TEST(CompletionTest, NoIncludeInsertionWhenDeclFoundInFile) {
 }

 TEST(CompletionTest, IndexSuppressesPreambleCompletions) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-
-  FS.Files[testPath("bar.h")] =
-      R"cpp(namespace ns { struct preamble { int member; }; })cpp";
-  auto File = testPath("foo.cpp");
   Annotations Test(R"cpp(
       #include "bar.h"
       namespace ns { int local; }
       void f() { ns::^; }
       void f2() { ns::preamble().$2^; }
   )cpp");
-  runAddDocument(Server, File, Test.code());
-  clangd::CodeCompleteOptions Opts = {};
+  auto TU = TestTU::withCode(Test.code());
+  TU.AdditionalFiles["bar.h"] =
+      R"cpp(namespace ns { struct preamble { int member; }; })cpp";

+  clangd::CodeCompleteOptions Opts = {};
   auto I = memIndex({var("ns::index")});
   Opts.Index = I.get();
-  auto WithIndex = cantFail(runCodeComplete(Server, File, Test.point(), Opts));
+  auto WithIndex = completions(TU, Test.point(), {}, Opts);
   EXPECT_THAT(WithIndex.Completions,
               UnorderedElementsAre(Named("local"), Named("index")));
-  auto ClassFromPreamble =
-      cantFail(runCodeComplete(Server, File, Test.point("2"), Opts));
+  auto ClassFromPreamble = completions(TU, Test.point("2"), {}, Opts);
   EXPECT_THAT(ClassFromPreamble.Completions, Contains(Named("member")));

   Opts.Index = nullptr;
-  auto WithoutIndex =
-      cantFail(runCodeComplete(Server, File, Test.point(), Opts));
+  auto WithoutIndex = completions(TU, Test.point(), {}, Opts);
   EXPECT_THAT(WithoutIndex.Completions,
               UnorderedElementsAre(Named("local"), Named("preamble")));
 }

@@ -811,7 +774,14 @@ TEST(CompletionTest, DynamicIndexIncludeInsertion) {
   Server.addDocument(testPath("foo_impl.cpp"), FileContent);
   // Wait for the dynamic index being built.
   ASSERT_TRUE(Server.blockUntilIdleForTest());
-  EXPECT_THAT(completions(Server, "Foo^ foo;").Completions,
+
+  auto File = testPath("foo.cpp");
+  Annotations Test("Foo^ foo;");
+  runAddDocument(Server, File, Test.code());
+  auto CompletionList =
+      llvm::cantFail(runCodeComplete(Server, File, Test.point(), {}));
+
+  EXPECT_THAT(CompletionList.Completions,
               ElementsAre(AllOf(Named("Foo"), HasInclude("\"foo_header.h\""),
                                 InsertInclude())));
 }
@@ -892,13 +862,17 @@ TEST(CompletionTest, CommentsFromSystemHeaders) {
     int foo();
   )cpp";

-  auto Results = completions(Server,
-                             R"cpp(
+  auto File = testPath("foo.cpp");
+  Annotations Test(R"cpp(
         #include "foo.h"
         int x = foo^
      )cpp");
+  runAddDocument(Server, File, Test.code());
+  auto CompletionList =
+      llvm::cantFail(runCodeComplete(Server, File, Test.point(), {}));
+
   EXPECT_THAT(
-      Results.Completions,
+      CompletionList.Completions,
       Contains(AllOf(Named("foo"), Doc("This comment should be retained!"))));
 }

@@ -1064,15 +1038,23 @@ SignatureHelp signatures(llvm::StringRef Text, Position Point,
   if (!IndexSymbols.empty())
     Index = memIndex(IndexSymbols);

-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  ClangdServer::Options Opts = ClangdServer::optsForTest();
-  Opts.StaticIndex = Index.get();
-
-  ClangdServer Server(CDB, FS, Opts);
-  auto File = testPath("foo.cpp");
-  runAddDocument(Server, File, Text);
-  return llvm::cantFail(runSignatureHelp(Server, File, Point));
+  auto TU = TestTU::withCode(Text);
+  auto Inputs = TU.inputs();
+  IgnoreDiagnostics Diags;
+  auto CI = buildCompilerInvocation(Inputs, Diags);
+  if (!CI) {
+    ADD_FAILURE() << "Couldn't build CompilerInvocation";
+    return {};
+  }
+  auto Preamble =
+      buildPreamble(testPath(TU.Filename), *CI, /*OldPreamble=*/nullptr, Inputs,
+                    /*InMemory=*/true, /*Callback=*/nullptr);
+  if (!Preamble) {
+    ADD_FAILURE() << "Couldn't build Preamble";
+    return {};
+  }
+  return signatureHelp(testPath(TU.Filename), Inputs.CompileCommand, *Preamble,
+                       Text, Point, Inputs.FS, Index.get());
 }

 SignatureHelp signatures(llvm::StringRef Text,
@@ -1546,14 +1528,7 @@ TEST(CompletionTest, DocumentationFromChangedFileCrash) {
 }

 TEST(CompletionTest, NonDocComments) {
-  MockFSProvider FS;
-  auto FooCpp = testPath("foo.cpp");
-  FS.Files[FooCpp] = "";
-
-  MockCompilationDatabase CDB;
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-
-  Annotations Source(R"cpp(
+  const char *Text = R"cpp(
     // We ignore namespace comments, for rationale see CodeCompletionStrings.h.
     namespace comments_ns {
    }
@@ -1588,17 +1563,11 @@ TEST(CompletionTest, NonDocComments) {
    int Struct::comments_quux() {
      int a = comments^;
    }
-  )cpp");
-  // FIXME: Auto-completion in a template requires disabling delayed template
-  // parsing.
-  CDB.ExtraClangFlags.push_back("-fno-delayed-template-parsing");
-  runAddDocument(Server, FooCpp, Source.code(), "null", WantDiagnostics::Yes);
-  CodeCompleteResult Completions = cantFail(runCodeComplete(
-      Server, FooCpp, Source.point(), clangd::CodeCompleteOptions()));
+  )cpp";

   // We should not get any of those comments in completion.
   EXPECT_THAT(
-      Completions.Completions,
+      completions(Text).Completions,
       UnorderedElementsAre(AllOf(Not(IsDocumented()), Named("comments_foo")),
                            AllOf(IsDocumented(), Named("comments_baz")),
                            AllOf(IsDocumented(), Named("comments_quux")),
@@ -1740,11 +1709,10 @@ TEST(CompletionTest, CodeCompletionContext) {
 TEST(CompletionTest, FixItForArrowToDot) {
   MockFSProvider FS;
   MockCompilationDatabase CDB;
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());

   CodeCompleteOptions Opts;
   Opts.IncludeFixIts = true;
-  Annotations TestCode(
+  const char* Code =
       R"cpp(
         class Auxilary {
          public:
@@ -1760,13 +1728,12 @@ TEST(CompletionTest, FixItForArrowToDot) {
           ClassWithPtr x;
           x[[->]]^;
         }
-      )cpp");
-  auto Results =
-      completions(Server, TestCode.code(), TestCode.point(), {}, Opts);
+      )cpp";
+  auto Results = completions(Code, {}, Opts);
   EXPECT_EQ(Results.Completions.size(), 3u);

   TextEdit ReplacementEdit;
-  ReplacementEdit.range = TestCode.range();
+  ReplacementEdit.range = Annotations(Code).range();
   ReplacementEdit.newText = ".";
   for (const auto &C : Results.Completions) {
     EXPECT_TRUE(C.FixIts.size() == 1u || C.Name == "AuxFunction");
@@ -1777,13 +1744,9 @@
 }

 TEST(CompletionTest, FixItForDotToArrow) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-
   CodeCompleteOptions Opts;
   Opts.IncludeFixIts = true;
-  Annotations TestCode(
+  const char* Code =
       R"cpp(
         class Auxilary {
          public:
@@ -1799,13 +1762,12 @@ TEST(CompletionTest, FixItForDotToArrow) {
           ClassWithPtr x;
           x[[.]]^;
         }
-      )cpp");
-  auto Results =
-      completions(Server, TestCode.code(), TestCode.point(), {}, Opts);
+      )cpp";
+  auto Results = completions(Code, {}, Opts);
   EXPECT_EQ(Results.Completions.size(), 3u);

   TextEdit ReplacementEdit;
-  ReplacementEdit.range = TestCode.range();
+  ReplacementEdit.range = Annotations(Code).range();
   ReplacementEdit.newText = "->";
   for (const auto &C : Results.Completions) {
     EXPECT_TRUE(C.FixIts.empty() || C.Name == "AuxFunction");
@@ -1858,8 +1820,8 @@ TEST(CompletionTest, RenderWithFixItNonMerged) {
 TEST(CompletionTest, CompletionTokenRange) {
   MockFSProvider FS;
   MockCompilationDatabase CDB;
-  FS.Files["foo/abc/foo.h"] = "";
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
+  TestTU TU;
+  TU.AdditionalFiles["foo/abc/foo.h"] = "";

   constexpr const char *TestCodes[] = {
       R"cpp(
@@ -1891,10 +1853,10 @@ TEST(CompletionTest, CompletionTokenRange) {
   };
   for (const auto &Text : TestCodes) {
     Annotations TestCode(Text);
-    auto Results = completions(Server, TestCode.code(), TestCode.point());
-
+    TU.Code = TestCode.code().str();
+    auto Results = completions(TU, TestCode.point());
     if (Results.Completions.size() != 1) {
-      ADD_FAILURE() << "Results.Completions.size() != 1";
+      ADD_FAILURE() << "Results.Completions.size() != 1" << Text;
       continue;
     }
     EXPECT_THAT(Results.Completions.front().CompletionTokenRange,
@@ -2247,13 +2209,12 @@ TEST(CompletionTest, InsertTheMostPopularHeader) {
 }

 TEST(CompletionTest, NoInsertIncludeIfOnePresent) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-
-  std::string FooHeader = testPath("foo.h");
-  FS.Files[FooHeader] = "";
-
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
+  Annotations Test(R"cpp(
+    #include "foo.h"
+    Fun^
+  )cpp");
+  auto TU = TestTU::withCode(Test.code());
+  TU.AdditionalFiles["foo.h"] = "";

   std::string DeclFile = URI::create(testPath("foo")).toString();
   Symbol Sym = func("Func");
@@ -2262,7 +2223,7 @@ TEST(CompletionTest, NoInsertIncludeIfOnePresent) {
   Sym.IncludeHeaders.emplace_back("\"bar.h\"", 1000);

   EXPECT_THAT(
-      completions(Server, "#include \"foo.h\"\nFun^", {Sym}).Completions,
+      completions(TU, Test.point(), {Sym}).Completions,
       UnorderedElementsAre(
           AllOf(Named("Func"), HasInclude("\"foo.h\""), Not(InsertInclude()))));
 }
@@ -2279,20 +2240,15 @@ TEST(CompletionTest, MergeMacrosFromIndexAndSema) {
 }

 TEST(CompletionTest, MacroFromPreamble) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  std::string FooHeader = testPath("foo.h");
-  FS.Files[FooHeader] = "#define CLANGD_PREAMBLE_HEADER x\n";
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-  auto Results = completions(
-      R"cpp(#include "foo.h"
-          #define CLANGD_PREAMBLE_MAIN x
+  Annotations Test(R"cpp(#define CLANGD_PREAMBLE_MAIN x
           int x = 0;
           #define CLANGD_MAIN x
           void f() { CLANGD_^ }
-      )cpp",
-      {func("CLANGD_INDEX")});
+      )cpp");
+  auto TU = TestTU::withCode(Test.code());
+  TU.HeaderCode = "#define CLANGD_PREAMBLE_HEADER x";
+  auto Results = completions(TU, Test.point(), {func("CLANGD_INDEX")});
   // We should get results from the main file, including the preamble section.
   // However no results from included files (the index should cover them).
   EXPECT_THAT(Results.Completions,
@@ -2405,29 +2361,22 @@ TEST(SignatureHelpTest, ConstructorInitializeFields) {
 }

 TEST(CompletionTest, IncludedCompletionKinds) {
-  MockFSProvider FS;
-  MockCompilationDatabase CDB;
-  std::string Subdir = testPath("sub");
-  std::string SearchDirArg = (Twine("-I") + Subdir).str();
-  CDB.ExtraClangFlags = {SearchDirArg.c_str()};
-  std::string BarHeader = testPath("sub/bar.h");
-  FS.Files[BarHeader] = "";
-  ClangdServer Server(CDB, FS, ClangdServer::optsForTest());
-  auto Results = completions(Server,
-                             R"cpp(
-        #include "^"
-    )cpp");
+  Annotations Test(R"cpp(#include "^")cpp");
+  auto TU = TestTU::withCode(Test.code());
+  TU.AdditionalFiles["sub/bar.h"] = "";
+  TU.ExtraArgs.push_back("-I" + testPath("sub"));
+
+  auto Results = completions(TU, Test.point());
   EXPECT_THAT(Results.Completions,
               AllOf(Has("sub/", CompletionItemKind::Folder),
                     Has("bar.h\"", CompletionItemKind::File)));
 }

 TEST(CompletionTest, NoCrashAtNonAlphaIncludeHeader) {
-  auto Results = completions(
+  completions(
       R"cpp(
         #include "./^"
     )cpp");
-  EXPECT_TRUE(Results.Completions.empty());
 }

 TEST(CompletionTest, NoAllScopesCompletionWhenQualified) {
@@ -2714,6 +2663,20 @@ TEST(CompletionTest, NoCrashWithIncompleteLambda) {
   EXPECT_THAT(Signatures, Contains(Sig("x() -> auto")));
 }

+TEST(CompletionTest, DelayedTemplateParsing) {
+  Annotations Test(R"cpp(
+    int xxx;
+    template <typename T> int foo() { return xx^; }
+  )cpp");
+  auto TU = TestTU::withCode(Test.code());
+  // Even though delayed-template-parsing is on, we will disable it to provide
+  // completion in templates.
+  TU.ExtraArgs.push_back("-fdelayed-template-parsing");
+
+  EXPECT_THAT(completions(TU, Test.point()).Completions,
+              Contains(Named("xxx")));
+}
+
 TEST(CompletionTest, CompletionRange) {
   const char *WithRange = "auto x = [[abc]]^";
   auto Completions = completions(WithRange);
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index c38ccc3f9441..7b6fff292e66 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -65,7 +65,7 @@ class TargetDeclTest : public ::testing::Test {
 protected:
   using Rel = DeclRelation;
   std::string Code;
-  std::vector<const char *> Flags;
+  std::vector<std::string> Flags;

   // Asserts that `Code` has a marked selection of a node `NodeType`,
   // and returns allTargetDecls() as PrintedDecl structs.
@@ -132,6 +132,16 @@ TEST_F(TargetDeclTest, Exprs) {
   EXPECT_DECLS("CXXOperatorCallExpr", "void operator()(int n)");
 }

+TEST_F(TargetDeclTest, Recovery) {
+  Code = R"cpp(
+    // error-ok: testing behavior on broken code
+    int f();
+    int f(int, int);
+    int x = [[f]](42);
+  )cpp";
+  EXPECT_DECLS("UnresolvedLookupExpr", "int f()", "int f(int, int)");
+}
+
 TEST_F(TargetDeclTest, UsingDecl) {
   Code = R"cpp(
     namespace foo {
@@ -685,6 +695,15 @@ TEST_F(FindExplicitReferencesTest, All) {
        )cpp",
        "0: targets = {x}\n"
        "1: targets = {X::a}\n"},
+      {R"cpp(
+       // error-ok: testing with broken code
+       int bar();
+       int foo() {
+         return $0^bar() + $1^bar(42);
+       }
+       )cpp",
+       "0: targets = {bar}\n"
+       "1: targets = {bar}\n"},
       // Namespaces and aliases.
       {R"cpp(
           namespace ns {}
diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp
index 909c125aed2e..2adcfc338cc2 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.cpp
+++ b/clang-tools-extra/clangd/unittests/TestTU.cpp
@@ -20,7 +20,7 @@
 namespace clang {
 namespace clangd {

-ParsedAST TestTU::build() const {
+ParseInputs TestTU::inputs() const {
   std::string FullFilename = testPath(Filename),
               FullHeaderName = testPath(HeaderFilename),
               ImportThunk = testPath("import_thunk.h");
@@ -34,43 +34,48 @@ ParsedAST TestTU::build() const {
   Files[FullHeaderName] = HeaderCode;
   Files[ImportThunk] = ThunkContents;

-  std::vector<const char *> Cmd = {"clang"};
+  ParseInputs Inputs;
+  auto& Argv = Inputs.CompileCommand.CommandLine;
+  Argv = {"clang"};
   // FIXME: this shouldn't need to be conditional, but it breaks a
   // GoToDefinition test for some reason (getMacroArgExpandedLocation fails).
   if (!HeaderCode.empty()) {
-    Cmd.push_back("-include");
-    Cmd.push_back(ImplicitHeaderGuard ? ImportThunk.c_str()
-                                      : FullHeaderName.c_str());
+    Argv.push_back("-include");
+    Argv.push_back(ImplicitHeaderGuard ? ImportThunk : FullHeaderName);
     // ms-compatibility changes the meaning of #import.
     // The default is OS-dependent (on on windows), ensure it's off.
     if (ImplicitHeaderGuard)
-      Cmd.push_back("-fno-ms-compatibility");
+      Inputs.CompileCommand.CommandLine.push_back("-fno-ms-compatibility");
   }
-  Cmd.insert(Cmd.end(), ExtraArgs.begin(), ExtraArgs.end());
+  Argv.insert(Argv.end(), ExtraArgs.begin(), ExtraArgs.end());
   // Put the file name at the end -- this allows the extra arg (-xc++) to
   // override the language setting.
-  Cmd.push_back(FullFilename.c_str());
-  ParseInputs Inputs;
+  Argv.push_back(FullFilename);
   Inputs.CompileCommand.Filename = FullFilename;
-  Inputs.CompileCommand.CommandLine = {Cmd.begin(), Cmd.end()};
   Inputs.CompileCommand.Directory = testRoot();
   Inputs.Contents = Code;
   Inputs.FS = buildTestFS(Files);
   Inputs.Opts = ParseOptions();
+  Inputs.Opts.BuildRecoveryAST = true;
   Inputs.Opts.ClangTidyOpts.Checks = ClangTidyChecks;
   Inputs.Opts.ClangTidyOpts.WarningsAsErrors = ClangTidyWarningsAsErrors;
   Inputs.Index = ExternalIndex;
   if (Inputs.Index)
     Inputs.Opts.SuggestMissingIncludes = true;
+  return Inputs;
+}
+
+ParsedAST TestTU::build() const {
+  auto Inputs = inputs();
   StoreDiags Diags;
   auto CI = buildCompilerInvocation(Inputs, Diags);
   assert(CI && "Failed to build compilation invocation.");
   auto Preamble =
-      buildPreamble(FullFilename, *CI,
+      buildPreamble(testPath(Filename), *CI,
                     /*OldPreamble=*/nullptr, Inputs,
                     /*StoreInMemory=*/true, /*PreambleCallback=*/nullptr);
-  auto AST =
-      buildAST(FullFilename, std::move(CI), Diags.take(), Inputs, Preamble);
+  auto AST = buildAST(testPath(Filename), std::move(CI), Diags.take(), Inputs,
+                      Preamble);
   if (!AST.hasValue()) {
     ADD_FAILURE() << "Failed to build code:\n" << Code;
     llvm_unreachable("Failed to build TestTU!");
@@ -79,9 +84,17 @@ ParsedAST TestTU::build() const {
   // This guards against accidental syntax errors silently subverting tests.
   // error-ok is awfully primitive - using clang -verify would be nicer.
   // Ownership and layering makes it pretty hard.
-  if (llvm::none_of(Files, [](const auto &KV) {
-        return llvm::StringRef(KV.second).contains("error-ok");
-      })) {
+  bool ErrorOk = [&, this] {
+    llvm::StringLiteral Marker = "error-ok";
+    if (llvm::StringRef(Code).contains(Marker) ||
+        llvm::StringRef(HeaderCode).contains(Marker))
+      return true;
+    for (const auto& KV : this->AdditionalFiles)
+      if (llvm::StringRef(KV.second).contains(Marker))
+        return true;
+    return false;
+  }();
+  if (!ErrorOk) {
     for (const auto &D : AST->getDiagnostics())
       if (D.Severity >= DiagnosticsEngine::Error) {
         ADD_FAILURE()
diff --git a/clang-tools-extra/clangd/unittests/TestTU.h b/clang-tools-extra/clangd/unittests/TestTU.h
index 4668543d5b4d..229f65a4b95c 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.h
+++ b/clang-tools-extra/clangd/unittests/TestTU.h
@@ -17,6 +17,7 @@
 #ifndef LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_TESTTU_H
 #define LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_TESTTU_H

+#include "Compiler.h"
 #include "ParsedAST.h"
 #include "Path.h"
 #include "index/Index.h"
@@ -54,7 +55,7 @@ struct TestTU {
   llvm::StringMap<std::string> AdditionalFiles;

   // Extra arguments for the compiler invocation.
-  std::vector<const char *> ExtraArgs;
+  std::vector<std::string> ExtraArgs;

   llvm::Optional<std::string> ClangTidyChecks;
   llvm::Optional<std::string> ClangTidyWarningsAsErrors;
@@ -67,6 +68,7 @@ struct TestTU {
   // By default, build() will report Error diagnostics as GTest errors.
   // Suppress this behavior by adding an 'error-ok' comment to the code.
   ParsedAST build() const;
+  ParseInputs inputs() const;
   SymbolSlab headerSymbols() const;
   std::unique_ptr<SymbolIndex> index() const;
 };
diff --git a/clang-tools-extra/clangd/unittests/TweakTesting.h b/clang-tools-extra/clangd/unittests/TweakTesting.h
index 10186f859bae..c771149a72fc 100644
--- a/clang-tools-extra/clangd/unittests/TweakTesting.h
+++ b/clang-tools-extra/clangd/unittests/TweakTesting.h
@@ -66,7 +66,7 @@ class TweakTest : public ::testing::Test {
   llvm::StringRef FileName = "TestTU.cpp";

   // Extra flags passed to the compilation in apply().
-  std::vector<const char *> ExtraArgs;
+  std::vector<std::string> ExtraArgs;

   // Context in which snippets of code should be placed to run tweaks.
   CodeContext Context = File;
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability-convert-member-functions-to-static.rst b/clang-tools-extra/docs/clang-tidy/checks/readability-convert-member-functions-to-static.rst
index 891f6be63714..c2f05cf589ea 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability-convert-member-functions-to-static.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability-convert-member-functions-to-static.rst
@@ -10,5 +10,5 @@ After applying modifications as suggested by the check, runnnig the check
 again might find more opportunities to mark member functions ``static``.

 After making a member function ``static``, you might want to run the check
-`readability-static-accessed-through-instance` to replace calls like
+`readability-static-accessed-through-instance <readability-static-accessed-through-instance.html>`_ to replace calls like
 ``Instance.method()`` by ``Class::method()``.
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index 0d359a1609a5..3d1e961ada8d 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -86,6 +86,8 @@ set(LIBCXXABI_TARGET_TRIPLE "${CMAKE_C_COMPILER_TARGET}" CACHE S
 set(LIBCXXABI_SYSROOT                "${DEFAULT_SYSROOT}" CACHE STRING "")
 set(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI OFF CACHE BOOL "")
 set(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX OFF CACHE BOOL "")
+set(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI OFF CACHE BOOL "")
+set(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX OFF CACHE BOOL "")
 set(LIBCXX_USE_COMPILER_RT           ON CACHE BOOL "")
 set(LIBCXX_TARGET_TRIPLE             "${CMAKE_C_COMPILER_TARGET}" CACHE STRING "")
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 209a77440537..7fef9e867885 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -229,7 +229,7 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | base language                | lambda support                                               | :good:`done`             |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc extension               | array shaping                                                | :part:`worked on`        | D74144                                                                |
+| misc extension               | array shaping                                                | :good:`done`             | D74144                                                                |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | misc extension               | library shutdown (omp_pause_resource[_all])                  | :none:`unclaimed parts`  | D55078                                                                |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 365951a360bb..41f1086a410f 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -140,7 +140,8 @@ def NoEscapeDocs : Documentation {
 the compiler that the pointer cannot escape: that is, no reference to the object
 the pointer points to that is derived from the parameter value will survive
 after the function returns.
 Users are responsible for making sure parameters
-annotated with ``noescape`` do not actuallly escape.
+annotated with ``noescape`` do not actually escape. Calling ``free()`` on such
+a parameter does not constitute an escape.

 For example:
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b9e16e695a39..1b2073b050f4 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5987,7 +5987,7 @@ def err_func_def_incomplete_result : Error<
 def err_atomic_specifier_bad_type : Error<
   "_Atomic cannot be applied to "
   "%select{incomplete |array |function |reference |atomic |qualified |sizeless |}0type "
-  "%1 %select{||||||which is not trivially copyable}0">;
+  "%1 %select{|||||||which is not trivially copyable}0">;

 // Expressions.
 def select_unary_expr_or_type_trait_kind : TextSubstitution<
@@ -10244,7 +10244,8 @@ def warn_nested_declare_variant
     "nested context ignored">,
     InGroup<SourceUsesOpenMP>;
 def err_omp_non_pointer_type_array_shaping_base : Error<
-    "expected pointer type expression as a base of an array shaping operation">;
+    "expected expression with a pointer to a complete type as a base of an array "
+    "shaping operation">;
 } // end of OpenMP category

 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index 9c2bc155cd4f..c47eb4587a57 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -662,7 +662,7 @@ class ASTSourceDescriptor {
   StringRef Path;
   StringRef ASTFile;
   ASTFileSignature Signature;
-  const Module *ClangModule = nullptr;
+  Module *ClangModule = nullptr;

 public:
   ASTSourceDescriptor() = default;
@@ -670,13 +670,13 @@ class ASTSourceDescriptor {
                       ASTFileSignature Signature)
       : PCHModuleName(std::move(Name)), Path(std::move(Path)),
         ASTFile(std::move(ASTFile)), Signature(Signature) {}
-  ASTSourceDescriptor(const Module &M);
+  ASTSourceDescriptor(Module &M);

   std::string getModuleName() const;
   StringRef getPath() const { return Path; }
   StringRef getASTFile() const { return ASTFile; }
   ASTFileSignature getSignature() const { return Signature; }
-  const Module *getModuleOrNull() const { return ClangModule; }
+  Module *getModuleOrNull() const { return ClangModule; }
 };
diff --git a/clang/include/clang/CodeGen/CodeGenABITypes.h b/clang/include/clang/CodeGen/CodeGenABITypes.h
index 31f0cea57232..5f4af7fd2a36 100644
--- a/clang/include/clang/CodeGen/CodeGenABITypes.h
+++ b/clang/include/clang/CodeGen/CodeGenABITypes.h
@@ -28,11 +28,12 @@
 #include "clang/CodeGen/CGFunctionInfo.h"

 namespace llvm {
-  class DataLayout;
-  class Module;
-  class Function;
-  class FunctionType;
-  class Type;
+class Constant;
+class DataLayout;
+class Module;
+class Function;
+class FunctionType;
+class Type;
 }

 namespace clang {
@@ -44,6 +45,7 @@ class CoverageSourceInfo;
 class DiagnosticsEngine;
 class HeaderSearchOptions;
 class ObjCMethodDecl;
+class ObjCProtocolDecl;
 class PreprocessorOptions;

 namespace CodeGen {
@@ -137,6 +139,13 @@ llvm::Function *getNonTrivialCStructDestructor(CodeGenModule &CGM,
                                                CharUnits DstAlignment,
                                                bool IsVolatile, QualType QT);

+/// Get a pointer to a protocol object for the given declaration, emitting it if
+/// it hasn't already been emitted in this translation unit. Note that the ABI
+/// for emitting a protocol reference in code (e.g. for a protocol expression)
+/// in most runtimes is not as simple as just materializing a pointer to this
+/// object.
+llvm::Constant *emitObjCProtocolObject(CodeGenModule &CGM,
+                                       const ObjCProtocolDecl *p);
 }  // end namespace CodeGen
 }  // end namespace clang
diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td
index 2224c152f626..218404e26409 100644
--- a/clang/include/clang/Driver/CC1Options.td
+++ b/clang/include/clang/Driver/CC1Options.td
@@ -282,8 +282,6 @@ def no_struct_path_tbaa : Flag<["-"], "no-struct-path-tbaa">,
   HelpText<"Turn off struct-path aware Type Based Alias Analysis">;
 def new_struct_path_tbaa : Flag<["-"], "new-struct-path-tbaa">,
   HelpText<"Enable enhanced struct-path aware Type Based Alias Analysis">;
-def masm_verbose : Flag<["-"], "masm-verbose">,
-  HelpText<"Generate verbose assembly output">;
 def mdebug_pass : Separate<["-"], "mdebug-pass">,
   HelpText<"Enable additional debug output">;
 def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
@@ -455,8 +453,6 @@ def fspell_checking_limit : Separate<["-"], "fspell-checking-limit">, MetaVarNam
 def fcaret_diagnostics_max_lines :
   Separate<["-"], "fcaret-diagnostics-max-lines">, MetaVarName<"<N>">,
   HelpText<"Set the maximum number of source lines to show in a caret diagnostic">;
-def fmessage_length : Separate<["-"], "fmessage-length">, MetaVarName<"<N>">,
-  HelpText<"Format message diagnostics so that they fit within N columns or fewer, when possible.">;
 def verify_EQ : CommaJoined<["-"], "verify=">, MetaVarName<"<prefixes>">,
   HelpText<"Verify diagnostic output using comment directives that start with"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d65cf30a73a8..97128d623a13 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -918,7 +918,7 @@ def fdiagnostics_hotness_threshold_EQ : Joined<["-"], "fdiagnostics-hotness-thre
   Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
   HelpText<"Prevent optimization remarks from being output if they do not have at least this profile count">;
 def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<f_Group>,
-    Flags<[CC1Option]>, HelpText<"Print option name with mappable diagnostics">;
+    HelpText<"Print option name with mappable diagnostics">;
 def fdiagnostics_show_note_include_stack : Flag<["-"], "fdiagnostics-show-note-include-stack">,
     Group<f_Group>, Flags<[CC1Option]>, HelpText<"Display include stacks for diagnostic notes">;
 def fdiagnostics_format_EQ : Joined<["-"], "fdiagnostics-format=">, Group<f_Group>;
@@ -1383,7 +1383,8 @@ def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">,
                                 Group<f_Group>, Flags<[DriverOption, CoreOption]>;
 def fmerge_all_constants : Flag<["-"], "fmerge-all-constants">, Group<f_Group>,
   Flags<[CC1Option, CoreOption]>, HelpText<"Allow merging of constants">;
-def fmessage_length_EQ : Joined<["-"], "fmessage-length=">, Group<f_Group>;
+def fmessage_length_EQ : Joined<["-"], "fmessage-length=">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Format message diagnostics so that they fit within N columns">;
 def fms_extensions : Flag<["-"], "fms-extensions">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
   HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">;
 def fms_compatibility : Flag<["-"], "fms-compatibility">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
@@ -1533,7 +1534,7 @@ def fno_cxx_modules : Flag <["-"], "fno-cxx-modules">, Group<f_Group>,
 def fno_diagnostics_fixit_info : Flag<["-"], "fno-diagnostics-fixit-info">, Group<f_Group>,
   Flags<[CC1Option]>, HelpText<"Do not include fixit information in diagnostics">;
 def fno_diagnostics_show_hotness : Flag<["-"],
"fno-diagnostics-show-hotness">, Group; -def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">, Group; +def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">, Group, Flags<[CC1Option]>; def fno_diagnostics_show_note_include_stack : Flag<["-"], "fno-diagnostics-show-note-include-stack">, Flags<[CC1Option]>, Group; def fdigraphs : Flag<["-"], "fdigraphs">, Group, Flags<[CC1Option]>, @@ -1626,7 +1627,7 @@ def fno_register_global_dtors_with_atexit : Flag<["-"], "fno-register-global-dto HelpText<"Don't use atexit or __cxa_atexit to register global destructors">; def fno_unit_at_a_time : Flag<["-"], "fno-unit-at-a-time">, Group; def fno_unwind_tables : Flag<["-"], "fno-unwind-tables">, Group; -def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group; +def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group, Flags<[CC1Option]>; def fno_working_directory : Flag<["-"], "fno-working-directory">, Group; def fno_wrapv : Flag<["-"], "fno-wrapv">, Group; def fno_zero_initialized_in_bss : Flag<["-"], "fno-zero-initialized-in-bss">, Group; @@ -1983,7 +1984,8 @@ def fuse_init_array : Flag<["-"], "fuse-init-array">, Group, def fno_use_init_array : Flag<["-"], "fno-use-init-array">, Group, Flags<[CC1Option]>, HelpText<"Don't use .init_array instead of .ctors">; def fno_var_tracking : Flag<["-"], "fno-var-tracking">, Group; -def fverbose_asm : Flag<["-"], "fverbose-asm">, Group; +def fverbose_asm : Flag<["-"], "fverbose-asm">, Group, + HelpText<"Generate verbose assembly output">; def dA : Flag<["-"], "dA">, Alias; def fvisibility_EQ : Joined<["-"], "fvisibility=">, Group, HelpText<"Set the default symbol visibility for all global declarations">, Values<"hidden,default">; diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index f3253d5b40e3..25476f78a6a0 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -59,8 +59,7 @@ class TargetOptions; /// report the error(s). 
 bool ParseDiagnosticArgs(DiagnosticOptions &Opts, llvm::opt::ArgList &Args,
                          DiagnosticsEngine *Diags = nullptr,
-                         bool DefaultDiagColor = true,
-                         bool DefaultShowOpt = true);
+                         bool DefaultDiagColor = true);

 class CompilerInvocationBase {
 public:
diff --git a/clang/include/clang/Frontend/FrontendAction.h b/clang/include/clang/Frontend/FrontendAction.h
index e994e24cf5af..c9f9f080c141 100644
--- a/clang/include/clang/Frontend/FrontendAction.h
+++ b/clang/include/clang/Frontend/FrontendAction.h
@@ -312,6 +312,7 @@ class WrapperFrontendAction : public FrontendAction {
   bool BeginSourceFileAction(CompilerInstance &CI) override;
   void ExecuteAction() override;
   void EndSourceFileAction() override;
+  bool shouldEraseOutputFiles() override;

 public:
   /// Construct a WrapperFrontendAction from an existing action, taking
diff --git a/clang/include/clang/Frontend/FrontendActions.h b/clang/include/clang/Frontend/FrontendActions.h
index 89ac20075fa4..9ca2bfda2138 100644
--- a/clang/include/clang/Frontend/FrontendActions.h
+++ b/clang/include/clang/Frontend/FrontendActions.h
@@ -119,17 +119,13 @@ class GenerateModuleAction : public ASTFrontendAction {
   bool hasASTFileSupport() const override { return false; }
 };

-class GenerateInterfaceStubAction : public ASTFrontendAction {
-protected:
-  TranslationUnitKind getTranslationUnitKind() override { return TU_Module; }
-
-  bool hasASTFileSupport() const override { return false; }
-};
-
-class GenerateInterfaceIfsExpV1Action : public GenerateInterfaceStubAction {
+class GenerateInterfaceStubsAction : public ASTFrontendAction {
 protected:
   std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) override;
+
+  TranslationUnitKind getTranslationUnitKind() override { return TU_Module; }
+  bool hasASTFileSupport() const override { return false; }
 };

 class GenerateModuleFromModuleMapAction : public GenerateModuleAction {
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 66fec6436a40..6069b5eea265 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -90,7 +90,7 @@ enum ActionKind {
   GeneratePCH,

   /// Generate Interface Stub Files.
-  GenerateInterfaceIfsExpV1,
+  GenerateInterfaceStubs,

   /// Only execute frontend initialization.
   InitOnly,
diff --git a/clang/lib/Analysis/RetainSummaryManager.cpp b/clang/lib/Analysis/RetainSummaryManager.cpp
index 00bc854a8804..9f45a8efe546 100644
--- a/clang/lib/Analysis/RetainSummaryManager.cpp
+++ b/clang/lib/Analysis/RetainSummaryManager.cpp
@@ -146,7 +146,9 @@ static bool isSubclass(const Decl *D,
 }

 static bool isOSObjectSubclass(const Decl *D) {
-  return D && isSubclass(D, "OSMetaClassBase");
+  // OSSymbols are particular OSObjects that are allocated globally
+  // and therefore aren't really refcounted, so we ignore them.
+ return D && isSubclass(D, "OSMetaClassBase") && !isSubclass(D, "OSSymbol"); } static bool isOSObjectDynamicCast(StringRef S) { diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index dd8f11101107..5fd7d304f8f4 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -659,7 +659,7 @@ void VisibleModuleSet::setVisible(Module *M, SourceLocation Loc, VisitModule({M, nullptr}); } -ASTSourceDescriptor::ASTSourceDescriptor(const Module &M) +ASTSourceDescriptor::ASTSourceDescriptor(Module &M) : Signature(M.Signature), ClangModule(&M) { if (M.Directory) Path = M.Directory->getName(); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 49c57e9860a6..6d3c2ad66cdc 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1817,9 +1817,10 @@ CGDebugInfo::CollectTemplateParams(const TemplateParameterList *TPList, if (auto *templateType = dyn_cast_or_null(TPList->getParam(i))) if (templateType->hasDefaultArgument()) - defaultParameter = + defaultParameter = llvm::APSInt::isSameValue( templateType->getDefaultArgument()->EvaluateKnownConstInt( - CGM.getContext()) == TA.getAsIntegral(); + CGM.getContext()), + TA.getAsIntegral()); TemplateParams.push_back(DBuilder.createTemplateValueParameter( TheCU, Name, TTy, defaultParameter, diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index db78309e9fd9..35b926808492 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -617,6 +617,13 @@ class CGObjCGNU : public CGObjCRuntime { llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF, const ObjCProtocolDecl *PD) override; void GenerateProtocol(const ObjCProtocolDecl *PD) override; + + virtual llvm::Constant *GenerateProtocolRef(const ObjCProtocolDecl *PD); + + llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD) override { + return GenerateProtocolRef(PD); + } + llvm::Function *ModuleInitFunction() override; llvm::FunctionCallee GetPropertyGetFunction() override; llvm::FunctionCallee GetPropertySetFunction() override; @@ -1348,7 +1355,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { void GenerateProtocol(const ObjCProtocolDecl *PD) override { // Do nothing - we only emit referenced protocols. 
} - llvm::Constant *GenerateProtocolRef(const ObjCProtocolDecl *PD) { + llvm::Constant *GenerateProtocolRef(const ObjCProtocolDecl *PD) override { std::string ProtocolName = PD->getNameAsString(); auto *&Protocol = ExistingProtocols[ProtocolName]; if (Protocol) @@ -3039,13 +3046,18 @@ CGObjCGNU::GenerateProtocolList(ArrayRef Protocols) { llvm::Value *CGObjCGNU::GenerateProtocolRef(CodeGenFunction &CGF, const ObjCProtocolDecl *PD) { + auto protocol = GenerateProtocolRef(PD); + llvm::Type *T = + CGM.getTypes().ConvertType(CGM.getContext().getObjCProtoType()); + return CGF.Builder.CreateBitCast(protocol, llvm::PointerType::getUnqual(T)); +} + +llvm::Constant *CGObjCGNU::GenerateProtocolRef(const ObjCProtocolDecl *PD) { llvm::Constant *&protocol = ExistingProtocols[PD->getNameAsString()]; if (!protocol) GenerateProtocol(PD); assert(protocol && "Unknown protocol"); - llvm::Type *T = - CGM.getTypes().ConvertType(CGM.getContext().getObjCProtoType()); - return CGF.Builder.CreateBitCast(protocol, llvm::PointerType::getUnqual(T)); + return protocol; } llvm::Constant * diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 87fd51b5d8b1..3986310eaa70 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -1107,11 +1107,6 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { void GenerateProtocol(const ObjCProtocolDecl *PD) override; - /// GetOrEmitProtocol - Get the protocol object for the given - /// declaration, emitting it if necessary. The return value has type - /// ProtocolPtrTy. - virtual llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD)=0; - /// GetOrEmitProtocolRef - Get a forward reference to the protocol /// object for the given declaration, emitting it if needed. These /// forward references will be filled in with empty bodies if no diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp index c34758c7e3b3..39efe040302d 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.cpp +++ b/clang/lib/CodeGen/CGObjCRuntime.cpp @@ -13,14 +13,15 @@ //===----------------------------------------------------------------------===// #include "CGObjCRuntime.h" -#include "CGCleanup.h" #include "CGCXXABI.h" +#include "CGCleanup.h" #include "CGRecordLayout.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/RecordLayout.h" #include "clang/AST/StmtObjC.h" #include "clang/CodeGen/CGFunctionInfo.h" +#include "clang/CodeGen/CodeGenABITypes.h" #include "llvm/Support/SaveAndRestore.h" using namespace clang; @@ -383,3 +384,9 @@ CGObjCRuntime::getMessageSendInfo(const ObjCMethodDecl *method, CGM.getTypes().GetFunctionType(argsInfo)->getPointerTo(); return MessageSendInfo(argsInfo, signatureType); } + +llvm::Constant * +clang::CodeGen::emitObjCProtocolObject(CodeGenModule &CGM, + const ObjCProtocolDecl *protocol) { + return CGM.getObjCRuntime().GetOrEmitProtocol(protocol); +} diff --git a/clang/lib/CodeGen/CGObjCRuntime.h b/clang/lib/CodeGen/CGObjCRuntime.h index f0b3525cfde2..a2c189585f7b 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.h +++ b/clang/lib/CodeGen/CGObjCRuntime.h @@ -211,6 +211,11 @@ class CGObjCRuntime { /// implementations. virtual void GenerateProtocol(const ObjCProtocolDecl *OPD) = 0; + /// GetOrEmitProtocol - Get the protocol object for the given + /// declaration, emitting it if necessary. The return value has type + /// ProtocolPtrTy. 
+  virtual llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD) = 0;
+
   /// Generate a function preamble for a method with the specified
   /// types.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index ae98433acb48..6642851a56bc 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -5374,7 +5374,7 @@ std::pair CGOpenMPRuntime::emitDependClause(
     llvm::Value *Size;
     QualType Ty = E->getType();
     if (OASE) {
-      Size = llvm::ConstantInt::get(CGF.SizeTy,/*V=*/1);
+      Size = CGF.getTypeSize(OASE->getBase()->getType()->getPointeeType());
       for (const Expr *SE : OASE->getDimensions()) {
         llvm::Value *Sz = CGF.EmitScalarExpr(SE);
         Sz = CGF.EmitScalarConversion(Sz, SE->getType(),
@@ -7448,6 +7448,20 @@ class MappableExprsHandler {
   llvm::Value *getExprTypeSize(const Expr *E) const {
     QualType ExprTy = E->getType().getCanonicalType();
 
+    // Calculate the size for array shaping expression.
+    if (const auto *OAE = dyn_cast<OMPArrayShapingExpr>(E)) {
+      llvm::Value *Size =
+          CGF.getTypeSize(OAE->getBase()->getType()->getPointeeType());
+      for (const Expr *SE : OAE->getDimensions()) {
+        llvm::Value *Sz = CGF.EmitScalarExpr(SE);
+        Sz = CGF.EmitScalarConversion(Sz, SE->getType(),
+                                      CGF.getContext().getSizeType(),
+                                      SE->getExprLoc());
+        Size = CGF.Builder.CreateNUWMul(Size, Sz);
+      }
+      return Size;
+    }
+
     // Reference types are ignored for mapping purposes.
     if (const auto *RefTy = ExprTy->getAs<ReferenceType>())
       ExprTy = RefTy->getPointeeType().getCanonicalType();
@@ -7779,6 +7793,7 @@ class MappableExprsHandler {
       const Expr *AssocExpr = I->getAssociatedExpression();
       const auto *AE = dyn_cast<ArraySubscriptExpr>(AssocExpr);
       const auto *OASE = dyn_cast<OMPArraySectionExpr>(AssocExpr);
+      const auto *OAShE = dyn_cast<OMPArrayShapingExpr>(AssocExpr);
 
       if (isa<CXXThisExpr>(AssocExpr)) {
         // The base is the 'this' pointer. The content of the pointer is going
@@ -7788,6 +7803,11 @@ class MappableExprsHandler {
                  (OASE && isa<CXXThisExpr>(OASE->getBase()->IgnoreParenImpCasts()))) {
         BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(CGF);
+      } else if (OAShE &&
+                 isa<CXXThisExpr>(OAShE->getBase()->IgnoreParenCasts())) {
+        BP = Address(
+            CGF.EmitScalarExpr(OAShE->getBase()),
+            CGF.getContext().getTypeAlignInChars(OAShE->getBase()->getType()));
       } else {
         // The base is the reference to the variable.
         // BP = &Var.
@@ -7870,9 +7890,12 @@ class MappableExprsHandler {
       // types.
const auto *OASE = dyn_cast(I->getAssociatedExpression()); + const auto *OAShE = + dyn_cast(I->getAssociatedExpression()); const auto *UO = dyn_cast(I->getAssociatedExpression()); const auto *BO = dyn_cast(I->getAssociatedExpression()); bool IsPointer = + OAShE || (OASE && OMPArraySectionExpr::getBaseOriginalType(OASE) .getCanonicalType() ->isAnyPointerType()) || @@ -7890,8 +7913,15 @@ class MappableExprsHandler { isa(Next->getAssociatedExpression())) && "Unexpected expression"); - Address LB = CGF.EmitOMPSharedLValue(I->getAssociatedExpression()) - .getAddress(CGF); + Address LB = Address::invalid(); + if (OAShE) { + LB = Address(CGF.EmitScalarExpr(OAShE->getBase()), + CGF.getContext().getTypeAlignInChars( + OAShE->getBase()->getType())); + } else { + LB = CGF.EmitOMPSharedLValue(I->getAssociatedExpression()) + .getAddress(CGF); + } // If this component is a pointer inside the base struct then we don't // need to create any entry for it - it will be combined with the object diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 1f06514a38c1..024fc068c217 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -848,55 +848,54 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, FD->getBody()->getStmtClass() == Stmt::CoroutineBodyStmtClass) SanOpts.Mask &= ~SanitizerKind::Null; - if (D) { - // Apply xray attributes to the function (as a string, for now) - if (const auto *XRayAttr = D->getAttr()) { - if (CGM.getCodeGenOpts().XRayInstrumentationBundle.has( - XRayInstrKind::FunctionEntry) || - CGM.getCodeGenOpts().XRayInstrumentationBundle.has( - XRayInstrKind::FunctionExit)) { - if (XRayAttr->alwaysXRayInstrument() && ShouldXRayInstrumentFunction()) - Fn->addFnAttr("function-instrument", "xray-always"); - if (XRayAttr->neverXRayInstrument()) - Fn->addFnAttr("function-instrument", "xray-never"); - if (const auto *LogArgs = D->getAttr()) - if (ShouldXRayInstrumentFunction()) - Fn->addFnAttr("xray-log-args", - llvm::utostr(LogArgs->getArgumentCount())); - } - } else { - if (ShouldXRayInstrumentFunction() && !CGM.imbueXRayAttrs(Fn, Loc)) - Fn->addFnAttr( - "xray-instruction-threshold", - llvm::itostr(CGM.getCodeGenOpts().XRayInstructionThreshold)); + // Apply xray attributes to the function (as a string, for now) + if (const auto *XRayAttr = D ? 
D->getAttr() : nullptr) { + if (CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::FunctionEntry) || + CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::FunctionExit)) { + if (XRayAttr->alwaysXRayInstrument() && ShouldXRayInstrumentFunction()) + Fn->addFnAttr("function-instrument", "xray-always"); + if (XRayAttr->neverXRayInstrument()) + Fn->addFnAttr("function-instrument", "xray-never"); + if (const auto *LogArgs = D->getAttr()) + if (ShouldXRayInstrumentFunction()) + Fn->addFnAttr("xray-log-args", + llvm::utostr(LogArgs->getArgumentCount())); } + } else { + if (ShouldXRayInstrumentFunction() && !CGM.imbueXRayAttrs(Fn, Loc)) + Fn->addFnAttr( + "xray-instruction-threshold", + llvm::itostr(CGM.getCodeGenOpts().XRayInstructionThreshold)); + } - if (ShouldXRayInstrumentFunction()) { - if (CGM.getCodeGenOpts().XRayIgnoreLoops) - Fn->addFnAttr("xray-ignore-loops"); + if (ShouldXRayInstrumentFunction()) { + if (CGM.getCodeGenOpts().XRayIgnoreLoops) + Fn->addFnAttr("xray-ignore-loops"); - if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( - XRayInstrKind::FunctionExit)) - Fn->addFnAttr("xray-skip-exit"); + if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::FunctionExit)) + Fn->addFnAttr("xray-skip-exit"); - if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( - XRayInstrKind::FunctionEntry)) - Fn->addFnAttr("xray-skip-entry"); - } + if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::FunctionEntry)) + Fn->addFnAttr("xray-skip-entry"); + } - unsigned Count, Offset; - if (const auto *Attr = D->getAttr()) { - Count = Attr->getCount(); - Offset = Attr->getOffset(); - } else { - Count = CGM.getCodeGenOpts().PatchableFunctionEntryCount; - Offset = CGM.getCodeGenOpts().PatchableFunctionEntryOffset; - } - if (Count && Offset <= Count) { - Fn->addFnAttr("patchable-function-entry", std::to_string(Count - Offset)); - if (Offset) - Fn->addFnAttr("patchable-function-prefix", std::to_string(Offset)); - } + unsigned Count, Offset; + if (const auto *Attr = + D ? D->getAttr() : nullptr) { + Count = Attr->getCount(); + Offset = Attr->getOffset(); + } else { + Count = CGM.getCodeGenOpts().PatchableFunctionEntryCount; + Offset = CGM.getCodeGenOpts().PatchableFunctionEntryOffset; + } + if (Count && Offset <= Count) { + Fn->addFnAttr("patchable-function-entry", std::to_string(Count - Offset)); + if (Offset) + Fn->addFnAttr("patchable-function-prefix", std::to_string(Offset)); } // Add no-jump-tables value. diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 6a0f82a9f9a0..caa71b10f231 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -609,7 +609,11 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { llvm::Type *PointeeType = ConvertTypeForMem(ETy); if (PointeeType->isVoidTy()) PointeeType = llvm::Type::getInt8Ty(getLLVMContext()); - unsigned AS = Context.getTargetAddressSpace(ETy); + + unsigned AS = PointeeType->isFunctionTy() + ? 
getDataLayout().getProgramAddressSpace() + : Context.getTargetAddressSpace(ETy); + ResultType = llvm::PointerType::get(PointeeType, AS); break; } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 06e4686ac2b9..2cec0dc9de22 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -103,6 +103,18 @@ AMDGPUToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, return DAL; } +bool AMDGPUToolChain::getDefaultDenormsAreZeroForTarget( + llvm::AMDGPU::GPUKind Kind) { + const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind); + + // Default to enabling f32 denormals by default on subtargets where fma is + // fast with denormals + const bool BothDenormAndFMAFast = + (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) && + (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32); + return !BothDenormAndFMAFast; +} + llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType( const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, const llvm::fltSemantics *FPType) const { @@ -121,18 +133,10 @@ llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType( const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ); auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); - // Default to enabling f32 denormals by default on subtargets where fma is - // fast with denormals - - const unsigned ArchAttr = llvm::AMDGPU::getArchAttrAMDGCN(Kind); - const bool DefaultDenormsAreZeroForTarget = - (ArchAttr & llvm::AMDGPU::FEATURE_FAST_FMA_F32) && - (ArchAttr & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32); - // TODO: There are way too many flags that change this. Do we need to check // them all? bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) || - !DefaultDenormsAreZeroForTarget; + getDefaultDenormsAreZeroForTarget(Kind); // Outputs are flushed to zero, preserving sign return DAZ ? llvm::DenormalMode::getPreserveSign() : llvm::DenormalMode::getIEEE(); diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index 78c40580b302..e7a873efb008 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -13,6 +13,8 @@ #include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" +#include "llvm/Support/TargetParser.h" + #include namespace clang { @@ -67,6 +69,10 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF { llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + /// Return whether denormals should be flushed, and treated as 0 by default + /// for the subtarget. + static bool getDefaultDenormsAreZeroForTarget(llvm::AMDGPU::GPUKind GPUKind); + llvm::DenormalMode getDefaultDenormalModeForType( const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index ce850a8d3ef8..603d04f0a9b3 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3579,9 +3579,9 @@ static void RenderDiagnosticsOptions(const Driver &D, const ArgList &Args, CmdArgs.push_back("-fno-diagnostics-fixit-info"); // Enable -fdiagnostics-show-option by default. 
- if (Args.hasFlag(options::OPT_fdiagnostics_show_option, - options::OPT_fno_diagnostics_show_option)) - CmdArgs.push_back("-fdiagnostics-show-option"); + if (!Args.hasFlag(options::OPT_fdiagnostics_show_option, + options::OPT_fno_diagnostics_show_option, true)) + CmdArgs.push_back("-fno-diagnostics-show-option"); if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_show_category_EQ)) { @@ -4273,7 +4273,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, StringRef ArgStr = Args.hasArg(options::OPT_interface_stub_version_EQ) ? Args.getLastArgValue(options::OPT_interface_stub_version_EQ) - : "experimental-ifs-v1"; + : "experimental-ifs-v2"; CmdArgs.push_back("-emit-interface-stubs"); CmdArgs.push_back( Args.MakeArgString(Twine("-interface-stub-version=") + ArgStr.str())); @@ -4733,9 +4733,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // Decide whether to use verbose asm. Verbose assembly is the default on // toolchains which have the integrated assembler on by default. bool IsIntegratedAssemblerDefault = TC.IsIntegratedAssemblerDefault(); - if (Args.hasFlag(options::OPT_fverbose_asm, options::OPT_fno_verbose_asm, - IsIntegratedAssemblerDefault)) - CmdArgs.push_back("-masm-verbose"); + if (!Args.hasFlag(options::OPT_fverbose_asm, options::OPT_fno_verbose_asm, + IsIntegratedAssemblerDefault)) + CmdArgs.push_back("-fno-verbose-asm"); if (!TC.useIntegratedAs()) CmdArgs.push_back("-no-integrated-as"); @@ -5220,15 +5220,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } // Pass -fmessage-length=. - CmdArgs.push_back("-fmessage-length"); + unsigned MessageLength = 0; if (Arg *A = Args.getLastArg(options::OPT_fmessage_length_EQ)) { - CmdArgs.push_back(A->getValue()); + StringRef V(A->getValue()); + if (V.getAsInteger(0, MessageLength)) + D.Diag(diag::err_drv_invalid_argument_to_option) + << V << A->getOption().getName(); } else { // If -fmessage-length=N was not specified, determine whether this is a // terminal and, if so, implicitly define -fmessage-length appropriately. - unsigned N = llvm::sys::Process::StandardErrColumns(); - CmdArgs.push_back(Args.MakeArgString(Twine(N))); + MessageLength = llvm::sys::Process::StandardErrColumns(); } + if (MessageLength != 0) + CmdArgs.push_back( + Args.MakeArgString("-fmessage-length=" + Twine(MessageLength))); // -fvisibility= and -fvisibility-ms-compat are of a piece. if (const Arg *A = Args.getLastArg(options::OPT_fvisibility_EQ, @@ -5701,7 +5706,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fexperimental_new_pass_manager, options::OPT_fno_experimental_new_pass_manager); - ObjCRuntime Runtime = AddObjCRuntimeArgs(Args, CmdArgs, rewriteKind); + ObjCRuntime Runtime = AddObjCRuntimeArgs(Args, Inputs, CmdArgs, rewriteKind); RenderObjCOptions(TC, D, RawTriple, Args, Runtime, rewriteKind != RK_None, Input, CmdArgs); @@ -6408,6 +6413,7 @@ Clang::~Clang() {} /// /// Returns true if the runtime is non-fragile. ObjCRuntime Clang::AddObjCRuntimeArgs(const ArgList &args, + const InputInfoList &inputs, ArgStringList &cmdArgs, RewriteKind rewriteKind) const { // Look for the controlling runtime option. 
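The next hunk makes the driver emit -fobjc-runtime= only when at least one compiler input is actually an Objective-C source. A minimal self-contained sketch of that guard, assuming a stand-in FileType enum and isObjC() helper in place of the driver's real InputInfoList and types::isObjC() (the hard-coded runtime string is likewise illustrative, not the driver's computed value):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the driver's input list entry.
enum class FileType { C, CXX, ObjC, ObjCXX };

static bool isObjC(FileType T) {
  // Mirrors the idea of types::isObjC(): ObjC and ObjC++ inputs count.
  return T == FileType::ObjC || T == FileType::ObjCXX;
}

int main() {
  std::vector<FileType> Inputs = {FileType::C, FileType::ObjC};
  std::vector<std::string> CmdArgs;
  // Append the flag only if some input is Objective-C, like the
  // llvm::any_of guard added to Clang::AddObjCRuntimeArgs below.
  if (std::any_of(Inputs.begin(), Inputs.end(),
                  [](FileType T) { return isObjC(T); }))
    CmdArgs.push_back("-fobjc-runtime=macosx"); // value is illustrative
  for (const std::string &A : CmdArgs)
    std::cout << A << '\n';
  return 0;
}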
@@ -6531,8 +6537,11 @@ ObjCRuntime Clang::AddObjCRuntimeArgs(const ArgList &args, runtime = ObjCRuntime(ObjCRuntime::GCC, VersionTuple()); } - cmdArgs.push_back( - args.MakeArgString("-fobjc-runtime=" + runtime.getAsString())); + if (llvm::any_of(inputs, [](const InputInfo &input) { + return types::isObjC(input.getType()); + })) + cmdArgs.push_back( + args.MakeArgString("-fobjc-runtime=" + runtime.getAsString())); return runtime; } diff --git a/clang/lib/Driver/ToolChains/Clang.h b/clang/lib/Driver/ToolChains/Clang.h index 1552515c1461..64af2fdd5115 100644 --- a/clang/lib/Driver/ToolChains/Clang.h +++ b/clang/lib/Driver/ToolChains/Clang.h @@ -77,6 +77,7 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool { enum RewriteKind { RK_None, RK_Fragile, RK_NonFragile }; ObjCRuntime AddObjCRuntimeArgs(const llvm::opt::ArgList &args, + const InputInfoList &inputs, llvm::opt::ArgStringList &cmdArgs, RewriteKind rewrite) const; diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 157dca7e0c8d..e4ace81dbac7 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "HIP.h" +#include "AMDGPU.h" #include "CommonArgs.h" #include "InputInfo.h" #include "clang/Basic/Cuda.h" @@ -16,6 +17,7 @@ #include "clang/Driver/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/TargetParser.h" using namespace clang::driver; using namespace clang::driver::toolchains; @@ -266,7 +268,7 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, HIPToolChain::HIPToolChain(const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC, const ArgList &Args) - : ToolChain(D, Triple, Args), HostTC(HostTC) { + : AMDGPUToolChain(D, Triple, Args), HostTC(HostTC) { // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. getProgramPaths().push_back(getDriver().Dir); @@ -283,6 +285,7 @@ void HIPToolChain::addClangTargetOptions( (void) GpuArch; assert(DeviceOffloadingKind == Action::OFK_HIP && "Only HIP offloading kinds are supported for GPUs."); + auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); CC1Args.push_back("-target-cpu"); CC1Args.push_back(DriverArgs.MakeArgStringRef(GpuArch)); @@ -343,11 +346,14 @@ void HIPToolChain::addClangTargetOptions( std::string GFXVersion = GpuArch.drop_front(3).str(); std::string ISAVerBC = "oclc_isa_version_" + GFXVersion + ".amdgcn.bc"; - llvm::StringRef FlushDenormalControlBC; - if (DriverArgs.hasArg(options::OPT_fcuda_flush_denormals_to_zero)) - FlushDenormalControlBC = "oclc_daz_opt_on.amdgcn.bc"; - else - FlushDenormalControlBC = "oclc_daz_opt_off.amdgcn.bc"; + bool FTZDAZ = DriverArgs.hasFlag( + options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, + getDefaultDenormsAreZeroForTarget(Kind)); + + std::string FlushDenormalControlBC = FTZDAZ ? 
+ "oclc_daz_opt_on.amdgcn.bc" : + "oclc_daz_opt_off.amdgcn.bc"; llvm::StringRef WaveFrontSizeBC; if (stoi(GFXVersion) < 1000) @@ -357,7 +363,7 @@ void HIPToolChain::addClangTargetOptions( BCLibs.append({"hip.amdgcn.bc", "ocml.amdgcn.bc", "ockl.amdgcn.bc", "oclc_finite_only_off.amdgcn.bc", - std::string(FlushDenormalControlBC), + FlushDenormalControlBC, "oclc_correctly_rounded_sqrt_on.amdgcn.bc", "oclc_unsafe_math_off.amdgcn.bc", ISAVerBC, std::string(WaveFrontSizeBC)}); diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h index c4f944e458bf..b6a3a2718635 100644 --- a/clang/lib/Driver/ToolChains/HIP.h +++ b/clang/lib/Driver/ToolChains/HIP.h @@ -11,6 +11,7 @@ #include "clang/Driver/ToolChain.h" #include "clang/Driver/Tool.h" +#include "AMDGPU.h" namespace clang { namespace driver { @@ -72,7 +73,7 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY HIPToolChain : public ToolChain { +class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public AMDGPUToolChain { public: HIPToolChain(const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC, const llvm::opt::ArgList &Args); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 0ff7b179b653..42eb121a8849 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -859,7 +859,7 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, Opts.CoverageMapping = Args.hasFlag(OPT_fcoverage_mapping, OPT_fno_coverage_mapping, false); Opts.DumpCoverageMapping = Args.hasArg(OPT_dump_coverage_mapping); - Opts.AsmVerbose = Args.hasArg(OPT_masm_verbose); + Opts.AsmVerbose = !Args.hasArg(OPT_fno_verbose_asm); Opts.PreserveAsmComments = !Args.hasArg(OPT_fno_preserve_as_comments); Opts.AssumeSaneOperatorNew = !Args.hasArg(OPT_fno_assume_sane_operator_new); Opts.ObjCAutoRefCountExceptions = Args.hasArg(OPT_fobjc_arc_exceptions); @@ -1553,7 +1553,7 @@ static bool checkVerifyPrefixes(const std::vector &VerifyPrefixes, bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, DiagnosticsEngine *Diags, - bool DefaultDiagColor, bool DefaultShowOpt) { + bool DefaultDiagColor) { bool Success = true; Opts.DiagnosticLogFile = @@ -1571,9 +1571,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, Opts.ShowFixits = !Args.hasArg(OPT_fno_diagnostics_fixit_info); Opts.ShowLocation = !Args.hasArg(OPT_fno_show_source_location); Opts.AbsolutePath = Args.hasArg(OPT_fdiagnostics_absolute_paths); - Opts.ShowOptionNames = - Args.hasFlag(OPT_fdiagnostics_show_option, - OPT_fno_diagnostics_show_option, DefaultShowOpt); + Opts.ShowOptionNames = !Args.hasArg(OPT_fno_diagnostics_show_option); llvm::sys::Process::UseANSIEscapeCodes(Args.hasArg(OPT_fansi_escape_codes)); @@ -1681,7 +1679,8 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, Diags->Report(diag::warn_ignoring_ftabstop_value) << Opts.TabStop << DiagnosticOptions::DefaultTabStop; } - Opts.MessageLength = getLastArgIntValue(Args, OPT_fmessage_length, 0, Diags); + Opts.MessageLength = + getLastArgIntValue(Args, OPT_fmessage_length_EQ, 0, Diags); addDiagnosticArgs(Args, OPT_W_Group, OPT_W_value_Group, Opts.Warnings); addDiagnosticArgs(Args, OPT_R_Group, OPT_R_value_Group, Opts.Remarks); @@ -1787,25 +1786,26 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, StringRef ArgStr = Args.hasArg(OPT_interface_stub_version_EQ) ? 
Args.getLastArgValue(OPT_interface_stub_version_EQ) - : "experimental-ifs-v1"; + : "experimental-ifs-v2"; if (ArgStr == "experimental-yaml-elf-v1" || + ArgStr == "experimental-ifs-v1" || ArgStr == "experimental-tapi-elf-v1") { std::string ErrorMessage = "Invalid interface stub format: " + ArgStr.str() + " is deprecated."; Diags.Report(diag::err_drv_invalid_value) << "Must specify a valid interface stub format type, ie: " - "-interface-stub-version=experimental-ifs-v1" + "-interface-stub-version=experimental-ifs-v2" << ErrorMessage; - } else if (ArgStr != "experimental-ifs-v1") { + } else if (!ArgStr.startswith("experimental-ifs-")) { std::string ErrorMessage = "Invalid interface stub format: " + ArgStr.str() + "."; Diags.Report(diag::err_drv_invalid_value) << "Must specify a valid interface stub format type, ie: " - "-interface-stub-version=experimental-ifs-v1" + "-interface-stub-version=experimental-ifs-v2" << ErrorMessage; } else { - Opts.ProgramAction = frontend::GenerateInterfaceIfsExpV1; + Opts.ProgramAction = frontend::GenerateInterfaceStubs; } break; } @@ -3385,7 +3385,7 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) { case frontend::GenerateModuleInterface: case frontend::GenerateHeaderModule: case frontend::GeneratePCH: - case frontend::GenerateInterfaceIfsExpV1: + case frontend::GenerateInterfaceStubs: case frontend::ParseSyntaxOnly: case frontend::ModuleFileInfo: case frontend::VerifyPCH: @@ -3613,9 +3613,8 @@ bool CompilerInvocation::CreateFromArgs(CompilerInvocation &Res, Diags.Report(diag::err_fe_dependency_file_requires_MT); Success = false; } - Success &= - ParseDiagnosticArgs(Res.getDiagnosticOpts(), Args, &Diags, - false /*DefaultDiagColor*/, false /*DefaultShowOpt*/); + Success &= ParseDiagnosticArgs(Res.getDiagnosticOpts(), Args, &Diags, + /*DefaultDiagColor=*/false); ParseCommentArgs(LangOpts.CommentOpts, Args); ParseFileSystemArgs(Res.getFileSystemOpts(), Args); // FIXME: We shouldn't have to pass the DashX option around here diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 1dc85d967ca0..0155238dd0a8 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -1081,6 +1081,9 @@ void WrapperFrontendAction::ExecuteAction() { void WrapperFrontendAction::EndSourceFileAction() { WrappedAction->EndSourceFileAction(); } +bool WrapperFrontendAction::shouldEraseOutputFiles() { + return WrappedAction->shouldEraseOutputFiles(); +} bool WrapperFrontendAction::usesPreprocessorOnly() const { return WrappedAction->usesPreprocessorOnly(); diff --git a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp index 2b7f0f8f9b66..b7c1e693413b 100644 --- a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp +++ b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp @@ -290,7 +290,7 @@ class InterfaceStubFunctionsConsumer : public ASTConsumer { const ASTContext &context, StringRef Format, raw_ostream &OS) -> void { OS << "--- !" << Format << "\n"; - OS << "IfsVersion: 1.0\n"; + OS << "IfsVersion: 2.0\n"; OS << "Triple: " << T.str() << "\n"; OS << "ObjectFileFormat: " << "ELF" @@ -299,11 +299,11 @@ class InterfaceStubFunctionsConsumer : public ASTConsumer { for (const auto &E : Symbols) { const MangledSymbol &Symbol = E.second; for (auto Name : Symbol.Names) { - OS << " \"" + OS << " - { Name: \"" << (Symbol.ParentName.empty() || Instance.getLangOpts().CPlusPlus ? 
"" : (Symbol.ParentName + ".")) - << Name << "\" : { Type: "; + << Name << "\", Type: "; switch (Symbol.Type) { default: llvm_unreachable( @@ -330,15 +330,15 @@ class InterfaceStubFunctionsConsumer : public ASTConsumer { OS.flush(); }; - assert(Format == "experimental-ifs-v1" && "Unexpected IFS Format."); + assert(Format == "experimental-ifs-v2" && "Unexpected IFS Format."); writeIfsV1(Instance.getTarget().getTriple(), Symbols, context, Format, *OS); } }; } // namespace std::unique_ptr -GenerateInterfaceIfsExpV1Action::CreateASTConsumer(CompilerInstance &CI, - StringRef InFile) { +GenerateInterfaceStubsAction::CreateASTConsumer(CompilerInstance &CI, + StringRef InFile) { return std::make_unique( - CI, InFile, "experimental-ifs-v1"); + CI, InFile, "experimental-ifs-v2"); } diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index ab7a1e32e301..7c59ae42d2a2 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -65,8 +65,8 @@ CreateFrontendBaseAction(CompilerInstance &CI) { case GenerateHeaderModule: return std::make_unique(); case GeneratePCH: return std::make_unique(); - case GenerateInterfaceIfsExpV1: - return std::make_unique(); + case GenerateInterfaceStubs: + return std::make_unique(); case InitOnly: return std::make_unique(); case ParseSyntaxOnly: return std::make_unique(); case ModuleFileInfo: return std::make_unique(); diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 3b30ddbd527b..c2c57cadfdf2 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -650,28 +650,28 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_mul(v128_t __a, return (v128_t)((__u8x16)__a * (__u8x16)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_s_i8x16((__i8x16)__a, (__i8x16)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_u_i8x16((__i8x16)__a, (__i8x16)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_s_i8x16((__i8x16)__a, (__i8x16)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_u_i8x16((__i8x16)__a, (__i8x16)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_avgr_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_avgr_u_i8x16((__i8x16)__a, (__i8x16)__b); } @@ -745,28 +745,28 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, return (v128_t)((__u16x8)__a * (__u16x8)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_s_i16x8((__i16x8)__a, (__i16x8)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min_u(v128_t __a, 
- v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_u_i16x8((__i16x8)__a, (__i16x8)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_s_i16x8((__i16x8)__a, (__i16x8)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_u_i16x8((__i16x8)__a, (__i16x8)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_avgr_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_avgr_u_i16x8((__i16x8)__a, (__i16x8)__b); } @@ -816,23 +816,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_mul(v128_t __a, return (v128_t)((__u32x4)__a * (__u32x4)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_s_i32x4((__i32x4)__a, (__i32x4)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_min_u_i32x4((__i32x4)__a, (__i32x4)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max_s(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_s_i32x4((__i32x4)__a, (__i32x4)__b); } -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max_u(v128_t __a, - v128_t __b) { +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a, + v128_t __b) { return (v128_t)__builtin_wasm_max_u_i32x4((__i32x4)__a, (__i32x4)__b); } diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 9a0f94ae7be5..fc07cfe5ad03 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -3001,7 +3001,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, // Match the ')'. T.consumeClose(); RParenLoc = T.getCloseLocation(); - Result = Actions.CorrectDelayedTyposInExpr(ParseExpression()); + Result = Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()); if (ErrorFound) { Result = ExprError(); } else if (!Result.isInvalid()) { diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index ff11e97c5783..f4b823768d4c 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1522,48 +1522,65 @@ class DeferredDiagnosticsEmitter } void visitUsedDecl(SourceLocation Loc, Decl *D) { - if (auto *FD = dyn_cast(D)) { - FunctionDecl *Caller = UseStack.empty() ? nullptr : UseStack.back(); - auto IsKnownEmitted = S.getEmissionStatus(FD, /*Final=*/true) == - Sema::FunctionEmissionStatus::Emitted; - if (!Caller) - ShouldEmit = IsKnownEmitted; - if ((!ShouldEmit && !S.getLangOpts().OpenMP && !Caller) || - S.shouldIgnoreInHostDeviceCheck(FD) || Visited.count(D)) - return; - // Finalize analysis of OpenMP-specific constructs. - if (Caller && S.LangOpts.OpenMP && UseStack.size() == 1) - S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc); - // Finalize analysis of SYCL-specific constructs. 
-      if (Caller && S.LangOpts.SYCLIsDevice)
-        S.finalizeSYCLDelayedAnalysis(Caller, FD, Loc);
-      if (Caller)
-        S.DeviceKnownEmittedFns[FD] = {Caller, Loc};
-      if (ShouldEmit || InOMPDeviceContext)
-        S.emitDeferredDiags(FD, Caller);
-      Visited.insert(D);
-      UseStack.push_back(FD);
-      if (auto *S = FD->getBody()) {
-        this->Visit(S);
-      }
-      UseStack.pop_back();
-      Visited.erase(D);
-    } else if (auto *VD = dyn_cast<VarDecl>(D)) {
-      if (auto *Init = VD->getInit()) {
-        if (S.LangOpts.SYCLIsDevice)
-          return;
-        auto DevTy = OMPDeclareTargetDeclAttr::getDeviceType(VD);
-        bool IsDev = DevTy && (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost ||
-                               *DevTy == OMPDeclareTargetDeclAttr::DT_Any);
-        if (IsDev)
-          ++InOMPDeviceContext;
-        this->Visit(Init);
-        if (IsDev)
-          --InOMPDeviceContext;
-      }
-    } else
+    if (isa<VarDecl>(D))
+      return;
+    if (auto *FD = dyn_cast<FunctionDecl>(D))
+      checkFunc(Loc, FD);
+    else
       Inherited::visitUsedDecl(Loc, D);
   }
+
+  void checkVar(VarDecl *VD) {
+    if (S.LangOpts.SYCLIsDevice)
+      return;
+    assert(VD->isFileVarDecl() &&
+           "Should only check file-scope variables");
+    if (auto *Init = VD->getInit()) {
+      auto DevTy = OMPDeclareTargetDeclAttr::getDeviceType(VD);
+      bool IsDev = DevTy && (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost ||
+                             *DevTy == OMPDeclareTargetDeclAttr::DT_Any);
+      if (IsDev)
+        ++InOMPDeviceContext;
+      this->Visit(Init);
+      if (IsDev)
+        --InOMPDeviceContext;
+    }
+  }
+
+  void checkFunc(SourceLocation Loc, FunctionDecl *FD) {
+    FunctionDecl *Caller = UseStack.empty() ? nullptr : UseStack.back();
+    auto IsKnownEmitted = S.getEmissionStatus(FD, /*Final=*/true) ==
+                          Sema::FunctionEmissionStatus::Emitted;
+    if (!Caller)
+      ShouldEmit = IsKnownEmitted;
+    if ((!ShouldEmit && !S.getLangOpts().OpenMP && !Caller) ||
+        S.shouldIgnoreInHostDeviceCheck(FD) || Visited.count(FD))
+      return;
+    // Finalize analysis of OpenMP-specific constructs.
+    if (Caller && S.LangOpts.OpenMP && UseStack.size() == 1)
+      S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc);
+    // Finalize analysis of SYCL-specific constructs.
+    if (Caller && S.LangOpts.SYCLIsDevice)
+      S.finalizeSYCLDelayedAnalysis(Caller, FD, Loc);
+    if (Caller)
+      S.DeviceKnownEmittedFns[FD] = {Caller, Loc};
+    if (ShouldEmit || InOMPDeviceContext)
+      S.emitDeferredDiags(FD, Caller);
+    Visited.insert(FD);
+    UseStack.push_back(FD);
+    if (auto *S = FD->getBody()) {
+      this->Visit(S);
+    }
+    UseStack.pop_back();
+    Visited.erase(FD);
+  }
+
+  void checkRecordedDecl(Decl *D) {
+    if (auto *FD = dyn_cast<FunctionDecl>(D))
+      checkFunc(SourceLocation(), FD);
+    else
+      checkVar(cast<VarDecl>(D));
+  }
 };
 } // namespace
@@ -1579,7 +1596,7 @@ void Sema::emitDeferredDiags() {
 
   DeferredDiagnosticsEmitter DDE(*this);
   for (auto D : DeclsToCheckForDeferredDiags)
-    DDE.visitUsedDecl(SourceLocation(), D);
+    DDE.checkRecordedDecl(D);
 }
 
 // In CUDA, there are some constructs which may appear in semantically-valid
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 74e65eaacfdb..98de799415a0 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -12270,7 +12270,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     VDecl->setInitStyle(VarDecl::ListInit);
   }
 
-  if (LangOpts.OpenMP && VDecl->hasGlobalStorage())
+  if (LangOpts.OpenMP && VDecl->isFileVarDecl())
     DeclsToCheckForDeferredDiags.push_back(VDecl);
   CheckCompleteVariableDeclaration(VDecl);
 }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8f13d8a58577..19532dad6de2 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -4835,10 +4835,13 @@ ExprResult Sema::ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc,
   if (!BaseTy->isPointerType() && Base->isTypeDependent())
     return OMPArrayShapingExpr::Create(Context, Context.DependentTy, Base,
                                        LParenLoc, RParenLoc, Dims, Brackets);
-  if (!BaseTy->isPointerType())
+  if (!BaseTy->isPointerType() ||
+      (!Base->isTypeDependent() &&
+       BaseTy->getPointeeType()->isIncompleteType()))
     return ExprError(Diag(Base->getExprLoc(),
                           diag::err_omp_non_pointer_type_array_shaping_base)
                      << Base->getSourceRange());
+
   SmallVector<Expr *, 4> NewDims;
   bool ErrorFound = false;
   for (Expr *Dim : Dims) {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index f9e8e3d6ccc8..7d2ae172fe4d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1943,7 +1943,8 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
           if (isa<ArraySubscriptExpr>(EI->getAssociatedExpression()) ||
               isa<OMPArraySectionExpr>(EI->getAssociatedExpression()) ||
-              isa(EI->getAssociatedExpression())) {
+              isa(EI->getAssociatedExpression()) ||
+              isa<OMPArrayShapingExpr>(EI->getAssociatedExpression())) {
             IsVariableAssociatedWithSection = true;
             // There is nothing more we need to know about this variable.
             return true;
@@ -3225,7 +3226,7 @@ class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
                     StackComponents, OpenMPClauseKind) {
                   // Variable is used if it has been marked as an array, array
-                  // section or the variable iself.
+                  // section, array shaping or the variable itself.
                   return StackComponents.size() == 1 ||
                          std::all_of(
                              std::next(StackComponents.rbegin()),
@@ -3236,6 +3237,8 @@ class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
                                           nullptr &&
                                       (isa<OMPArraySectionExpr>(
                                            MC.getAssociatedExpression()) ||
+                                       isa<OMPArrayShapingExpr>(
+                                           MC.getAssociatedExpression()) ||
                                        isa<ArraySubscriptExpr>(
                                            MC.getAssociatedExpression()));
                             });
@@ -3393,8 +3396,10 @@ class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
             // Do both expressions have the same kind?
             if (CCI->getAssociatedExpression()->getStmtClass() !=
                 SC.getAssociatedExpression()->getStmtClass())
-              if (!(isa<OMPArraySectionExpr>(
-                        SC.getAssociatedExpression()) &&
+              if (!((isa<OMPArraySectionExpr>(
+                         SC.getAssociatedExpression()) ||
+                     isa<OMPArrayShapingExpr>(
+                         SC.getAssociatedExpression())) &&
                     isa<ArraySubscriptExpr>(
                         CCI->getAssociatedExpression())))
                 return false;
@@ -16284,6 +16289,15 @@ class MapBaseChecker final : public StmtVisitor<MapBaseChecker, bool> {
     Components.emplace_back(OASE, nullptr);
     return RelevantExpr || Visit(E);
   }
+  bool VisitOMPArrayShapingExpr(OMPArrayShapingExpr *E) {
+    Expr *Base = E->getBase();
+
+    // Record the component - we don't have any declaration associated.
+    Components.emplace_back(E, nullptr);
+
+    return Visit(Base->IgnoreParenImpCasts());
+  }
+
   bool VisitUnaryOperator(UnaryOperator *UO) {
     if (SemaRef.getLangOpts().OpenMP < 50 || !UO->isLValue() ||
         UO->getOpcode() != UO_Deref) {
@@ -16409,9 +16423,11 @@ static bool checkMapConflicts(
         // variable in map clauses of the same construct.
         if (CurrentRegionOnly &&
             (isa<ArraySubscriptExpr>(CI->getAssociatedExpression()) ||
-             isa<OMPArraySectionExpr>(CI->getAssociatedExpression())) &&
+             isa<OMPArraySectionExpr>(CI->getAssociatedExpression()) ||
+             isa<OMPArrayShapingExpr>(CI->getAssociatedExpression())) &&
            (isa<ArraySubscriptExpr>(SI->getAssociatedExpression()) ||
-             isa<OMPArraySectionExpr>(SI->getAssociatedExpression()))) {
+             isa<OMPArraySectionExpr>(SI->getAssociatedExpression()) ||
+             isa<OMPArrayShapingExpr>(SI->getAssociatedExpression()))) {
           SemaRef.Diag(CI->getAssociatedExpression()->getExprLoc(),
                        diag::err_omp_multiple_array_items_in_map_clause)
               << CI->getAssociatedExpression()->getSourceRange();
@@ -16443,6 +16459,9 @@ static bool checkMapConflicts(
           const Expr *E = OASE->getBase()->IgnoreParenImpCasts();
           Type =
               OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
+        } else if (const auto *OASE = dyn_cast<OMPArrayShapingExpr>(
+                       SI->getAssociatedExpression())) {
+          Type = OASE->getBase()->getType()->getPointeeType();
         }
         if (Type.isNull() || Type->isAnyPointerType() ||
             checkArrayExpressionDoesNotReferToWholeSize(
@@ -16905,6 +16924,7 @@ static void checkMappableExpressionList(
     QualType Type;
     auto *ASE = dyn_cast<ArraySubscriptExpr>(VE->IgnoreParens());
     auto *OASE = dyn_cast<OMPArraySectionExpr>(VE->IgnoreParens());
+    auto *OAShE = dyn_cast<OMPArrayShapingExpr>(VE->IgnoreParens());
     if (ASE) {
       Type = ASE->getType().getNonReferenceType();
     } else if (OASE) {
@@ -16915,6 +16935,8 @@ static void checkMappableExpressionList(
       else
         Type = BaseType->getPointeeType();
       Type = Type.getNonReferenceType();
+    } else if (OAShE) {
+      Type = OAShE->getBase()->getType()->getPointeeType();
     } else {
       Type = VE->getType();
     }
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 7437f649a090..bea9bdd22bab 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -8514,7 +8514,7 @@ unsigned ASTReader::getModuleFileID(ModuleFile *F) {
 
 llvm::Optional<ASTSourceDescriptor>
 ASTReader::getSourceDescriptor(unsigned ID) {
-  if (const Module *M = getSubmodule(ID))
+  if (Module *M = getSubmodule(ID))
     return ASTSourceDescriptor(*M);
 
   // If there is only a single PCH, return it instead.
diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp
index 002233e49bb0..d869796b82c1 100644
--- a/clang/lib/Serialization/GeneratePCH.cpp
+++ b/clang/lib/Serialization/GeneratePCH.cpp
@@ -57,6 +57,11 @@ void PCHGenerator::HandleTranslationUnit(ASTContext &Ctx) {
     }
   }
 
+  // Errors that do not prevent the PCH from being written should not cause the
+  // overall compilation to fail either.
+  if (AllowASTWithErrors)
+    PP.getDiagnostics().getClient()->clear();
+
   // Emit the PCH file to the Buffer.
assert(SemaPtr && "No Sema?"); Buffer->Signature = diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp index e4b720df6b11..0b8d100992a2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp @@ -508,13 +508,7 @@ void NullabilityChecker::checkEvent(ImplicitNullDerefEvent Event) const { /// return expressions of ObjC types when the return type of the function or /// method is non-null but the express is not. static const Expr *lookThroughImplicitCasts(const Expr *E) { - assert(E); - - while (auto *ICE = dyn_cast(E)) { - E = ICE->getSubExpr(); - } - - return E; + return E->IgnoreImpCasts(); } /// This method check when nullable pointer or null value is returned from a diff --git a/clang/test/Analysis/osobject-retain-release.cpp b/clang/test/Analysis/osobject-retain-release.cpp index 41606a30c39f..d88349dcd807 100644 --- a/clang/test/Analysis/osobject-retain-release.cpp +++ b/clang/test/Analysis/osobject-retain-release.cpp @@ -53,6 +53,9 @@ struct MyArray : public OSArray { OSObject *generateObject(OSObject *input) override; }; +// These are never refcounted. +struct OSSymbol : OSObject {}; + struct OtherStruct { static void doNothingToArray(OSArray *array); OtherStruct(OSArray *arr); @@ -754,3 +757,10 @@ void test() { b(0); } } // namespace inherited_constructor_crash + +namespace ossymbol_suppression { +OSSymbol *createSymbol(); +void test() { + OSSymbol *sym = createSymbol(); // no-warning +} +} // namespace ossymbol_suppression diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp index f812ea1bd8be..1afea99e8895 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fdiagnostics-show-option -verify %s +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s template struct set{}; diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp index bb6bb73ec702..e3599db18350 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify %s -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++98 %s -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s // C++98 [basic.lookup.classref]p1: // In a class member access expression (5.2.5), if the . 
or -> token is diff --git a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-cxx03-extra-copy.cpp b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-cxx03-extra-copy.cpp index 7a5caef36e73..e3190245d240 100644 --- a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-cxx03-extra-copy.cpp +++ b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-cxx03-extra-copy.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++03 -fdiagnostics-show-option -Wbind-to-temporary-copy -verify %s +// RUN: %clang_cc1 -fsyntax-only -std=c++03 -Wbind-to-temporary-copy -verify %s // C++03 requires that we check for a copy constructor when binding a // reference to a temporary, since we are allowed to make a copy, Even diff --git a/clang/test/CodeGen/builtins-systemz-zvector.c b/clang/test/CodeGen/builtins-systemz-zvector.c index 6cba71098792..da0e720c9fae 100644 --- a/clang/test/CodeGen/builtins-systemz-zvector.c +++ b/clang/test/CodeGen/builtins-systemz-zvector.c @@ -3665,31 +3665,31 @@ void test_integer(void) { // CHECK-ASM: vsumqg idx = vec_test_mask(vsc, vuc); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vuc, vuc); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vss, vus); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vus, vus); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vsi, vui); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vui, vui); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vsl, vul); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vul, vul); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vd, vul); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm } diff --git a/clang/test/CodeGen/builtins-systemz-zvector2.c b/clang/test/CodeGen/builtins-systemz-zvector2.c index 1880fed64dbc..a4f791e6019b 100644 --- a/clang/test/CodeGen/builtins-systemz-zvector2.c +++ b/clang/test/CodeGen/builtins-systemz-zvector2.c @@ -654,10 +654,10 @@ void test_integer(void) { // CHECK-ASM: vsrlb idx = vec_test_mask(vf, vui); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm idx = vec_test_mask(vd, vul); - // CHECK: call signext i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call i32 @llvm.s390.vtm(<16 
x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm vuc = vec_msum_u128(vul, vul, vuc, 0); diff --git a/clang/test/CodeGen/movbe-builtins.c b/clang/test/CodeGen/movbe-builtins.c index 15f49b84ec67..342f66391388 100644 --- a/clang/test/CodeGen/movbe-builtins.c +++ b/clang/test/CodeGen/movbe-builtins.c @@ -7,7 +7,7 @@ short test_loadbe_i16(const short *P) { // CHECK-LABEL: @test_loadbe_i16 // CHECK: [[LOAD:%.*]] = load i16, i16* %{{.*}}, align 1 - // CHECK: call signext i16 @llvm.bswap.i16(i16 [[LOAD]]) + // CHECK: call i16 @llvm.bswap.i16(i16 [[LOAD]]) return _loadbe_i16(P); } diff --git a/clang/test/CodeGen/rot-intrinsics.c b/clang/test/CodeGen/rot-intrinsics.c index 7b1ffb6ae3a6..dcdc54c4585a 100644 --- a/clang/test/CodeGen/rot-intrinsics.c +++ b/clang/test/CodeGen/rot-intrinsics.c @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -ffreestanding -triple i686--linux -emit-llvm -mllvm -update-return-attrs=false %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG -// RUN: %clang_cc1 -ffreestanding -triple x86_64--linux -emit-llvm -mllvm -update-return-attrs=false %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG -// RUN: %clang_cc1 -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -mllvm -update-return-attrs=false -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG -// RUN: %clang_cc1 -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -mllvm -update-return-attrs=false -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG -// RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -mllvm -update-return-attrs=false -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG -// RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -mllvm -update-return-attrs=false -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG +// RUN: %clang_cc1 -ffreestanding -triple i686--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG +// RUN: %clang_cc1 -ffreestanding -triple x86_64--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG +// RUN: %clang_cc1 -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG +// RUN: %clang_cc1 -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG +// RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG +// RUN: %clang_cc1 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG #include diff --git a/clang/test/CodeGen/xray-global-init.cpp b/clang/test/CodeGen/xray-global-init.cpp new file mode 100644 index 000000000000..588be8a45a50 --- /dev/null +++ b/clang/test/CodeGen/xray-global-init.cpp @@ -0,0 +1,13 
@@ +// RUN: %clang_cc1 -triple=x86_64-linux-gnu -emit-llvm -fxray-instrument -fxray-instruction-threshold=1 %s -o - \ +// RUN: | FileCheck %s + +struct A { + A(); + ~A(); +}; + +A a; + +// Check that the xray-instruction-threshold was applied +// CHECK: define internal void @_GLOBAL__sub_I_xray_global_init.cpp() [[NUX:#[0-9]+]] section ".text.startup" { +// CHECK: attributes [[NUX]] = { noinline nounwind {{.*}}"xray-instruction-threshold"="1"{{.*}} } diff --git a/clang/test/CodeGenCXX/debug-info-template-parameter.cpp b/clang/test/CodeGenCXX/debug-info-template-parameter.cpp index 95e7a187fe10..c38c535d8b06 100644 --- a/clang/test/CodeGenCXX/debug-info-template-parameter.cpp +++ b/clang/test/CodeGenCXX/debug-info-template-parameter.cpp @@ -8,22 +8,24 @@ // CHECK: DILocalVariable(name: "f1", {{.*}}, type: ![[TEMPLATE_TYPE:[0-9]+]] // CHECK: [[TEMPLATE_TYPE]] = {{.*}}!DICompositeType({{.*}}, templateParams: ![[F1_TYPE:[0-9]+]] -// CHECK: [[F1_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]]} +// CHECK: [[F1_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]], ![[THIRD:[0-9]+]]} // CHECK: [[FIRST]] = !DITemplateTypeParameter(name: "T", type: !{{[0-9]*}}) // CHECK: [[SECOND]] = !DITemplateValueParameter(name: "i", type: !{{[0-9]*}}, value: i32 6) +// CHECK: [[THIRD]] = !DITemplateValueParameter(name: "b", type: !{{[0-9]*}}, value: i8 0) // CHECK: DILocalVariable(name: "f2", {{.*}}, type: ![[TEMPLATE_TYPE:[0-9]+]] // CHECK: [[TEMPLATE_TYPE]] = {{.*}}!DICompositeType({{.*}}, templateParams: ![[F2_TYPE:[0-9]+]] -// CHECK: [[F2_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]]} +// CHECK: [[F2_TYPE]] = !{![[FIRST:[0-9]+]], ![[SECOND:[0-9]+]], ![[THIRD:[0-9]+]]} // CHECK: [[FIRST]] = !DITemplateTypeParameter(name: "T", type: !{{[0-9]*}}, defaulted: true) // CHECK: [[SECOND]] = !DITemplateValueParameter(name: "i", type: !{{[0-9]*}}, defaulted: true, value: i32 3) +// CHECK: [[THIRD]] = !DITemplateValueParameter(name: "b", type: !{{[0-9]*}}, defaulted: true, value: i8 1) -template +template class foo { }; int main() { - foo f1; + foo f1; foo<> f2; return 0; } diff --git a/clang/test/CodeGenObjC/debug-info-class-extension.m b/clang/test/CodeGenObjC/debug-info-class-extension.m index a27810cce743..db654e6a60a5 100644 --- a/clang/test/CodeGenObjC/debug-info-class-extension.m +++ b/clang/test/CodeGenObjC/debug-info-class-extension.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. -// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_objc_complete_type diff --git a/clang/test/CodeGenObjC/debug-info-class-extension2.m b/clang/test/CodeGenObjC/debug-info-class-extension2.m index d4750c120f60..ea7865b4ac5b 100644 --- a/clang/test/CodeGenObjC/debug-info-class-extension2.m +++ b/clang/test/CodeGenObjC/debug-info-class-extension2.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_objc_complete_type @interface Foo {} @end diff --git a/clang/test/CodeGenObjC/debug-info-class-extension3.m b/clang/test/CodeGenObjC/debug-info-class-extension3.m index a9cf6f6a5c59..f81445b47a21 100644 --- a/clang/test/CodeGenObjC/debug-info-class-extension3.m +++ b/clang/test/CodeGenObjC/debug-info-class-extension3.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK-NOT: AT_APPLE_objc_complete_type diff --git a/clang/test/CodeGenObjC/debug-info-property.m b/clang/test/CodeGenObjC/debug-info-property.m index 9b471be23dbd..ca013b24be42 100644 --- a/clang/test/CodeGenObjC/debug-info-property.m +++ b/clang/test/CodeGenObjC/debug-info-property.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. -// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_property_name // CHECK: AT_APPLE_property_attribute diff --git a/clang/test/CodeGenObjC/debug-info-property2.m b/clang/test/CodeGenObjC/debug-info-property2.m index 6a15922c932c..7e0a5e9f954b 100644 --- a/clang/test/CodeGenObjC/debug-info-property2.m +++ b/clang/test/CodeGenObjC/debug-info-property2.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. -// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_property_name @interface C { diff --git a/clang/test/CodeGenObjC/debug-info-property4.m b/clang/test/CodeGenObjC/debug-info-property4.m index f862c85b344d..1f489f2f6b63 100644 --- a/clang/test/CodeGenObjC/debug-info-property4.m +++ b/clang/test/CodeGenObjC/debug-info-property4.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. -// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_property_name // CHECK-NOT: AT_APPLE_property_getter diff --git a/clang/test/CodeGenObjC/debug-info-property5.m b/clang/test/CodeGenObjC/debug-info-property5.m index 191da9c16fcc..8b70f1ff2082 100644 --- a/clang/test/CodeGenObjC/debug-info-property5.m +++ b/clang/test/CodeGenObjC/debug-info-property5.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. -// RUN: %clang_cc1 -triple %itanium_abi_triple -masm-verbose -S -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s // CHECK: AT_APPLE_property_name // CHECK: AT_APPLE_property_getter diff --git a/clang/test/CodeGenObjC/property-dbg.m b/clang/test/CodeGenObjC/property-dbg.m index fb70747f5db8..f15213131ccc 100644 --- a/clang/test/CodeGenObjC/property-dbg.m +++ b/clang/test/CodeGenObjC/property-dbg.m @@ -1,5 +1,5 @@ // FIXME: Check IR rather than asm, then triple is not needed. 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited -masm-verbose -x objective-c < %s | grep DW_AT_name +// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited -x objective-c < %s | grep DW_AT_name @interface Foo { int i; } diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index 79d8f6f18ab0..2f02970a2a8e 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -4,7 +4,6 @@ // I386: "-disable-free" // I386: "-mrelocation-model" "static" // I386: "-mframe-pointer=all" -// I386: "-masm-verbose" // I386: "-munwind-tables" // I386: "-Os" // I386: "-fvisibility" diff --git a/clang/test/Driver/cuda-flush-denormals-to-zero.cu b/clang/test/Driver/cuda-flush-denormals-to-zero.cu index 74f4bbc1585e..5b1046b0cb12 100644 --- a/clang/test/Driver/cuda-flush-denormals-to-zero.cu +++ b/clang/test/Driver/cuda-flush-denormals-to-zero.cu @@ -7,6 +7,16 @@ // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s +// Test explicit argument. +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + +// Test the default changing with no argument based on the subtarget. +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s + // CPUFTZ-NOT: -fdenormal-fp-math // FTZ-NOT: -fdenormal-fp-math-f32= diff --git a/clang/test/Driver/darwin-objc-options.m b/clang/test/Driver/darwin-objc-options.m index 3e21fb38c0a9..60827f2937ed 100644 --- a/clang/test/Driver/darwin-objc-options.m +++ b/clang/test/Driver/darwin-objc-options.m @@ -31,5 +31,12 @@ // CHECK-CHECK-I386_IOS-NOT: -fobjc-dispatch-method // CHECK-CHECK-I386_IOS: darwin-objc-options +/// Don't add -fobjc-runtime for non-ObjC input. +// RUN: touch %t.c +// RUN: %clang -target x86_64-apple-darwin -x objective-c -S -### %t.c 2>&1 | FileCheck --check-prefix=F %s +// RUN: %clang -target x86_64-apple-darwin -S -### %t.c 2>&1 | FileCheck --check-prefix=NO_F %s +// F: -fobjc-runtime= +// NO_F-NOT: -fobjc-runtime= + // Don't crash with an unexpected target triple. 
// RUN: %clang -target i386-apple-ios7 -S -### %s diff --git a/clang/test/Driver/fdiagnostics-show-option.c b/clang/test/Driver/fdiagnostics-show-option.c new file mode 100644 index 000000000000..a574503cacee --- /dev/null +++ b/clang/test/Driver/fdiagnostics-show-option.c @@ -0,0 +1,7 @@ +/// -fdiagnostics-show-option is the default +// RUN: %clang -### -c %s 2>&1 | FileCheck --check-prefix=ENABLED %s +// ENABLED-NOT: "-fno-diagnostics-show-option" + +// RUN: %clang -### -c %s -fdiagnostics-show-option -fno-diagnostics-show-option 2>&1 | \ +// RUN: FileCheck --check-prefix=DISABLED %s +// DISABLED: "-fno-diagnostics-show-option" diff --git a/clang/test/Driver/fmessage-length.c b/clang/test/Driver/fmessage-length.c new file mode 100644 index 000000000000..638add05b2e5 --- /dev/null +++ b/clang/test/Driver/fmessage-length.c @@ -0,0 +1,9 @@ +// RUN: %clang -### -c %s -fmessage-length=80 2>&1 | FileCheck %s +// CHECK: "-fmessage-length=80" + +/// Omit -fmessage-length=0 to simplify common CC1 command lines. +// RUN: %clang -### -c %s -fmessage-length=0 2>&1 | FileCheck --check-prefix=ZERO %s +// ZERO-NOT: "-fmessage-length=0" + +// RUN: %clang -### -c %s -fmessage-length=nan 2>&1 | FileCheck --check-prefix=ERR %s +// ERR: error: invalid argument 'nan' to -fmessage-length= diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip index b79cb70cbe68..cb1747c2d798 100644 --- a/clang/test/Driver/hip-device-libs.hip +++ b/clang/test/Driver/hip-device-libs.hip @@ -2,23 +2,94 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// Test flush-denormals-to-zero enabled uses oclc_daz_opt_on +// Test whether oclc_daz_opt_on or oclc_daz_opt_off is linked, depending on +// the expected denormal mode. +// Test subtarget with flushing on by default. +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,FLUSHD + + +// Test subtarget with flushing off by default. // RUN: %clang -### -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,NOFLUSHD + + +// Test explicit flag, opposite of target default. +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 \ // RUN: -fcuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s --check-prefixes=COM,FLUSHD -// Test flush-denormals-to-zero disabled uses oclc_daz_opt_off +// Test explicit flag, opposite of target default. +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 \ +// RUN: -fno-cuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,NOFLUSHD + + +// Test explicit flag, same as target default. // RUN: %clang -### -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx900 \ +// RUN: -fno-cuda-flush-denormals-to-zero \ // RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s --check-prefixes=COM,NOFLUSHD + +// Test explicit flag, same as target default.
+// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 \ +// RUN: -fcuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,FLUSHD + + +// Test that the last flag wins, not flushing. +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 \ +// RUN: -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,NOFLUSHD + + +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 \ +// RUN: -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,NOFLUSHD + + +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 \ +// RUN: -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,FLUSHD + + +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 \ +// RUN: -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_dev_lib \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COM,FLUSHD + + // Test environment variable HIP_DEVICE_LIB_PATH // RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/hip_dev_lib \ @@ -33,4 +104,3 @@ // COM-SAME: "-mlink-builtin-bitcode" "{{.*}}ockl.amdgcn.bc" // FLUSHD-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_on.amdgcn.bc" // NOFLUSHD-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.amdgcn.bc" - diff --git a/clang/test/Driver/integrated-as.c b/clang/test/Driver/integrated-as.c index df5cf1a17ecc..170515579b1a 100644 --- a/clang/test/Driver/integrated-as.c +++ b/clang/test/Driver/integrated-as.c @@ -12,6 +12,7 @@ // NOFIAS-NOT: cc1as // NOFIAS: -cc1 +// NOFIAS: "-fno-verbose-asm" // NOFIAS: -no-integrated-as // RUN: %clang -target arm-linux-androideabi -### \ diff --git a/clang/test/Driver/rewrite-legacy-objc.m b/clang/test/Driver/rewrite-legacy-objc.m index dc92dd4bf107..fb7df4b97c06 100644 --- a/clang/test/Driver/rewrite-legacy-objc.m +++ b/clang/test/Driver/rewrite-legacy-objc.m @@ -3,11 +3,11 @@ // TEST0: clang{{.*}}" "-cc1" // TEST0: "-rewrite-objc" // FIXME: CHECK-NOT is broken somehow, it doesn't work here. Check adjacency instead.
-// TEST0: "-fmessage-length" "0" "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16" "-fdiagnostics-show-option" +// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16" // TEST0: rewrite-legacy-objc.m" // RUN: %clang -no-canonical-prefixes -target i386-apple-macosx10.9.0 -rewrite-legacy-objc %s -o - -### 2>&1 | \ // RUN: FileCheck -check-prefix=TEST1 %s // RUN: %clang -no-canonical-prefixes -target i386-apple-macosx10.6.0 -rewrite-legacy-objc %s -o - -### 2>&1 | \ // RUN: FileCheck -check-prefix=TEST2 %s -// TEST1: "-fmessage-length" "0" "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16" "-fdiagnostics-show-option" -// TEST2: "-fmessage-length" "0" "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16" "-fdiagnostics-show-option" +// TEST1: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16" +// TEST2: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16" diff --git a/clang/test/Driver/rewrite-objc.m b/clang/test/Driver/rewrite-objc.m index b04062992b7f..6073dcdfdafe 100644 --- a/clang/test/Driver/rewrite-objc.m +++ b/clang/test/Driver/rewrite-objc.m @@ -3,4 +3,4 @@ // TEST0: clang{{.*}}" "-cc1" // TEST0: "-rewrite-objc" // FIXME: CHECK-NOT is broken somehow, it doesn't work here. Check adjacency instead. 
-// TEST0: "-fmessage-length" "0" "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16" "-fdiagnostics-show-option" +// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1" "-fobjc-runtime=macosx" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16" diff --git a/clang/test/Driver/show-option-names.c b/clang/test/Driver/show-option-names.c deleted file mode 100644 index 9843a4371f14..000000000000 --- a/clang/test/Driver/show-option-names.c +++ /dev/null @@ -1,7 +0,0 @@ -// REQUIRES: x86-registered-target - -// RUN: %clang -target x86_64-apple-darwin -fsyntax-only -isysroot /FOO %s 2>&1 | FileCheck --check-prefix=CHECK-SHOW-OPTION-NAMES %s -// CHECK-SHOW-OPTION-NAMES: warning: no such sysroot directory: '{{([A-Za-z]:.*)?}}/FOO' [-Wmissing-sysroot] - -// RUN: %clang -target x86_64-apple-darwin -fsyntax-only -fno-diagnostics-show-option -isysroot /FOO %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SHOW-OPTION-NAMES %s -// CHECK-NO-SHOW-OPTION-NAMES: warning: no such sysroot directory: '{{([A-Za-z]:.*)?}}/FOO'{{$}} diff --git a/clang/test/Frontend/diagnostics-option-names.c b/clang/test/Frontend/diagnostics-option-names.c index ed0d2ed8ef9e..71455be0a75b 100644 --- a/clang/test/Frontend/diagnostics-option-names.c +++ b/clang/test/Frontend/diagnostics-option-names.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fdiagnostics-show-option -Werror -Weverything %s 2> %t +// RUN: not %clang_cc1 -Werror -Weverything %s 2> %t // RUN: FileCheck < %t %s int f0(int, unsigned); diff --git a/clang/test/Frontend/source-col-map.c b/clang/test/Frontend/source-col-map.c index 1c8078998c56..b257261b8b2b 100644 --- a/clang/test/Frontend/source-col-map.c +++ b/clang/test/Frontend/source-col-map.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fsyntax-only -fmessage-length 75 -o /dev/null -x c < %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: not %clang_cc1 -fsyntax-only -fmessage-length=75 -o /dev/null -x c < %s 2>&1 | FileCheck %s -strict-whitespace // REQUIRES: utf8-capable-terminal // Test case for the text diagnostics source column conversion crash. 
diff --git a/clang/test/Index/pch-with-errors.c b/clang/test/Index/pch-with-errors.c index 5c94a8a8e4d3..e8711c8e26a9 100644 --- a/clang/test/Index/pch-with-errors.c +++ b/clang/test/Index/pch-with-errors.c @@ -42,3 +42,6 @@ void foo(void) { // RUN: not c-index-test -write-pch %t.pch foobar.c 2>&1 | FileCheck -check-prefix=NONEXISTENT %s // NONEXISTENT: Unable to load translation unit + +// RUN: %clang -x c-header %s -o %t-clang.h.pch -Xclang -detailed-preprocessing-record -Xclang -fallow-pch-with-compiler-errors +// RUN: c-index-test -index-file %s -include %t-clang.h -Xclang -detailed-preprocessing-record | FileCheck -check-prefix=CHECK-INDEX %s diff --git a/clang/test/InterfaceStubs/bad-format.cpp b/clang/test/InterfaceStubs/bad-format.cpp index 4d51ac867eb2..1289067a365a 100644 --- a/clang/test/InterfaceStubs/bad-format.cpp +++ b/clang/test/InterfaceStubs/bad-format.cpp @@ -7,6 +7,9 @@ // RUN: not %clang -emit-interface-stubs -interface-stub-version=experimental-yaml-elf-v1 %s 2>&1 | \ // RUN: FileCheck -check-prefix=CHECK-YAML-DEPRECATED %s +// RUN: not %clang -emit-interface-stubs -interface-stub-version=experimental-ifs-v1 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-V1-DEPRECATED %s + // RUN: not %clang -emit-interface-stubs -interface-stub-version=bad-format %s 2>&1 | \ // RUN: FileCheck %s @@ -21,16 +24,22 @@ // CHECK: error: invalid value // CHECK: 'Invalid interface stub format: bad-format.' in 'Must specify a // CHECK: valid interface stub format type, ie: -// CHECK: -interface-stub-version=experimental-ifs-v1' +// CHECK: -interface-stub-version=experimental-ifs-v2' // CHECK-TAPI-DEPRECATED: error: invalid value // CHECK-TAPI-DEPRECATED: 'Invalid interface stub format: // CHECK-TAPI-DEPRECATED: experimental-tapi-elf-v1 is deprecated.' in 'Must // CHECK-TAPI-DEPRECATED: specify a valid interface stub format type, ie: -// CHECK-TAPI-DEPRECATED: -interface-stub-version=experimental-ifs-v1' +// CHECK-TAPI-DEPRECATED: -interface-stub-version=experimental-ifs-v2' // CHECK-YAML-DEPRECATED: error: invalid value // CHECK-YAML-DEPRECATED: 'Invalid interface stub format: // CHECK-YAML-DEPRECATED: experimental-yaml-elf-v1 is deprecated.' in 'Must // CHECK-YAML-DEPRECATED: specify a valid interface stub format type, ie: -// CHECK-YAML-DEPRECATED: -interface-stub-version=experimental-ifs-v1' +// CHECK-YAML-DEPRECATED: -interface-stub-version=experimental-ifs-v2' + +// CHECK-V1-DEPRECATED: error: invalid value +// CHECK-V1-DEPRECATED: 'Invalid interface stub format: +// CHECK-V1-DEPRECATED: experimental-ifs-v1 is deprecated.' 
in 'Must +// CHECK-V1-DEPRECATED: specify a valid interface stub format type, ie: +// CHECK-V1-DEPRECATED: -interface-stub-version=experimental-ifs-v2' diff --git a/clang/test/InterfaceStubs/blocks.c b/clang/test/InterfaceStubs/blocks.c index 927f2bf28869..8e2a01159aab 100644 --- a/clang/test/InterfaceStubs/blocks.c +++ b/clang/test/InterfaceStubs/blocks.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -emit-interface-stubs -fblocks -o - %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/class-template-partial-specialization.cpp b/clang/test/InterfaceStubs/class-template-partial-specialization.cpp index 4c0edaa2dd8f..b6580861de8b 100644 --- a/clang/test/InterfaceStubs/class-template-partial-specialization.cpp +++ b/clang/test/InterfaceStubs/class-template-partial-specialization.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/conflict-type.ifs b/clang/test/InterfaceStubs/conflict-type.ifs index aaa04775e317..cc6191900a30 100644 --- a/clang/test/InterfaceStubs/conflict-type.ifs +++ b/clang/test/InterfaceStubs/conflict-type.ifs @@ -7,10 +7,10 @@ # CHECK-IFS-NEXT: Filename: # CHECK-IFS-NEXT: Type Values: Object Func ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Object, Size: 1 } + - { Name: a, Type: Object, Size: 1 } ... diff --git a/clang/test/InterfaceStubs/constructor-using-shadow.cpp b/clang/test/InterfaceStubs/constructor-using-shadow.cpp index d4b85ac73e56..e806cc323ee7 100644 --- a/clang/test/InterfaceStubs/constructor-using-shadow.cpp +++ b/clang/test/InterfaceStubs/constructor-using-shadow.cpp @@ -1,12 +1,12 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: // CHECK-NEXT: ... 
- // ConstructorUsingShadowDecl +// ConstructorUsingShadowDecl struct Base { Base(int); }; struct Derived : public Base { using Base::Base; }; diff --git a/clang/test/InterfaceStubs/cxx-conversion.cpp b/clang/test/InterfaceStubs/cxx-conversion.cpp index 96425a42b6fc..f9de07d17850 100644 --- a/clang/test/InterfaceStubs/cxx-conversion.cpp +++ b/clang/test/InterfaceStubs/cxx-conversion.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/cxxdeduction-guide.cpp b/clang/test/InterfaceStubs/cxxdeduction-guide.cpp index f09b9d929ca3..4d9f24bae5b3 100644 --- a/clang/test/InterfaceStubs/cxxdeduction-guide.cpp +++ b/clang/test/InterfaceStubs/cxxdeduction-guide.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs -std=c++17 %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/driver-test3.c b/clang/test/InterfaceStubs/driver-test3.c index bccd1c9bccd4..a3f3966dbe8f 100644 --- a/clang/test/InterfaceStubs/driver-test3.c +++ b/clang/test/InterfaceStubs/driver-test3.c @@ -8,12 +8,12 @@ // CHECK-OBJ: bar -// CHECK-IFS: --- !experimental-ifs-v1 +// CHECK-IFS: --- !experimental-ifs-v2 // CHECK-IFS-NEXT: IfsVersion: // CHECK-IFS-NEXT: Triple: // CHECK-IFS-NEXT: ObjectFileFormat: // CHECK-IFS-NEXT: Symbols: -// CHECK-IFS-NEXT: "bar" : { Type: Func } +// CHECK-IFS-NEXT: - { Name: "bar", Type: Func } // CHECK-IFS-NEXT: ... int bar(int a) { return a; } diff --git a/clang/test/InterfaceStubs/empty.c b/clang/test/InterfaceStubs/empty.c new file mode 100644 index 000000000000..c68c124e513e --- /dev/null +++ b/clang/test/InterfaceStubs/empty.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s + +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 +// CHECK-NEXT: Triple: +// CHECK-NEXT: ObjectFileFormat: +// CHECK-NEXT: Symbols: +// CHECK-NEXT: ... diff --git a/clang/test/InterfaceStubs/func.ifs b/clang/test/InterfaceStubs/func.ifs index d115523bfda4..9de5213de9d6 100644 --- a/clang/test/InterfaceStubs/func.ifs +++ b/clang/test/InterfaceStubs/func.ifs @@ -7,13 +7,13 @@ # RUN: %clang -emit-interface-stubs -o - %s %s -emit-merged-ifs | \ # RUN: FileCheck %s --check-prefixes=CHECK-MERGE-IFS -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-DAG: a: { Type: Func } -# CHECK-IFS-DAG: b: { Type: Object, Size: 4 } +# CHECK-IFS-DAG: - { Name: a, Type: Func } +# CHECK-IFS-DAG: - { Name: b, Type: Object, Size: 4 } # CHECK-IFS: ... # CHECK-ELF: ELF Header: @@ -23,18 +23,18 @@ # CHECK-ELF: OBJECT GLOBAL DEFAULT 1 b # Here we are testing to see if two identical symbols will merge. 
-# CHECK-MERGE-IFS: --- !experimental-ifs-v1 -# CHECK-MERGE-IFS-NEXT: IfsVersion: 1.0 +# CHECK-MERGE-IFS: --- !experimental-ifs-v2 +# CHECK-MERGE-IFS-NEXT: IfsVersion: 2.0 # CHECK-MERGE-IFS-NEXT: Triple: x86_64-linux-gnu # CHECK-MERGE-IFS-NEXT: ObjectFileFormat: ELF # CHECK-MERGE-IFS-NEXT: Symbols: -# CHECK-MERGE-IFS-NEXT: a: { Type: Func } +# CHECK-MERGE-IFS-NEXT: - { Name: a, Type: Func } # CHECK-MERGE-IFS-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/clang/test/InterfaceStubs/hidden-class-inheritance.cpp b/clang/test/InterfaceStubs/hidden-class-inheritance.cpp index 19ba579608ec..2219fd5b2e8a 100644 --- a/clang/test/InterfaceStubs/hidden-class-inheritance.cpp +++ b/clang/test/InterfaceStubs/hidden-class-inheritance.cpp @@ -14,7 +14,7 @@ // RUN: -DPARENT_METHOD_VISIBILITY="" -DCHILD_METHOD_VISIBILITY="" %s | \ // RUN: FileCheck -check-prefix=CHECK-HP %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -o - -emit-interface-stubs \ -// RUN: -interface-stub-version=experimental-ifs-v1 \ +// RUN: -interface-stub-version=experimental-ifs-v2 \ // RUN: -DPARENT_CLASS_VISIBILITY=HIDDEN -DCHILD_CLASS_VISIBILITY="" \ // RUN: -DPARENT_METHOD_VISIBILITY="" -DCHILD_METHOD_VISIBILITY="" %s | \ // RUN: FileCheck -check-prefix=CHECK-HP2 %s diff --git a/clang/test/InterfaceStubs/indirect-field-decl.cpp b/clang/test/InterfaceStubs/indirect-field-decl.cpp index d0e5fd26e4b7..2c30b0ee4005 100644 --- a/clang/test/InterfaceStubs/indirect-field-decl.cpp +++ b/clang/test/InterfaceStubs/indirect-field-decl.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/inline.c b/clang/test/InterfaceStubs/inline.c index 0b0ac83726ad..1dec4ae677d7 100644 --- a/clang/test/InterfaceStubs/inline.c +++ b/clang/test/InterfaceStubs/inline.c @@ -55,8 +55,8 @@ INLINE int foo() { // RUN: -c -std=gnu89 -xc %s | llvm-nm - 2>&1 | \ // RUN: FileCheck -check-prefix=CHECK-SYMBOLS %s -// CHECK-TAPI-DAG: foo" : { Type: Func } -// CHECK-TAPI-DAG: foo.var" : { Type: Object, Size: 4 } +// CHECK-TAPI-DAG: foo", Type: Func } +// CHECK-TAPI-DAG: foo.var", Type: Object, Size: 4 } // CHECK-SYMBOLS-DAG: foo // CHECK-SYMBOLS-DAG: foo.var #include "inline.h" diff --git a/clang/test/InterfaceStubs/lambda.cpp b/clang/test/InterfaceStubs/lambda.cpp index e892f1eee11c..a167f6556b94 100644 --- a/clang/test/InterfaceStubs/lambda.cpp +++ b/clang/test/InterfaceStubs/lambda.cpp @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-interface-stubs -o - %s \ // RUN: | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: -// CHECK-NEXT: f" : { Type: Object, Size: 1 } +// CHECK-NEXT: f", Type: Object, Size: 1 } // CHECK-NEXT: ... 
auto f = [](void* data) { int i; }; diff --git a/clang/test/InterfaceStubs/namespace-alias.cpp b/clang/test/InterfaceStubs/namespace-alias.cpp index 6a7f27c9b7b0..a4e05f904701 100644 --- a/clang/test/InterfaceStubs/namespace-alias.cpp +++ b/clang/test/InterfaceStubs/namespace-alias.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/namespace.cpp b/clang/test/InterfaceStubs/namespace.cpp index 1c62346d22fc..ad4db24ff7de 100644 --- a/clang/test/InterfaceStubs/namespace.cpp +++ b/clang/test/InterfaceStubs/namespace.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/non-type-template-parm-decl.cpp b/clang/test/InterfaceStubs/non-type-template-parm-decl.cpp index 51176ac0ba0b..6390099dee5f 100644 --- a/clang/test/InterfaceStubs/non-type-template-parm-decl.cpp +++ b/clang/test/InterfaceStubs/non-type-template-parm-decl.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/object.c b/clang/test/InterfaceStubs/object.c index d6e28c5f884a..45e2d38ba3e9 100644 --- a/clang/test/InterfaceStubs/object.c +++ b/clang/test/InterfaceStubs/object.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fvisibility default -o - -emit-interface-stubs %s | FileCheck -check-prefix=CHECK-TAPI %s // RUN: %clang -fvisibility=default -c -o - %s | llvm-nm - 2>&1 | FileCheck -check-prefix=CHECK-SYMBOLS %s -// CHECK-TAPI: data" : { Type: Object, Size: 4 } +// CHECK-TAPI: data", Type: Object, Size: 4 } // CHECK-SYMBOLS: data int data = 42; diff --git a/clang/test/InterfaceStubs/object.ifs b/clang/test/InterfaceStubs/object.ifs index 7dc1134bac93..3afdf4e65eef 100644 --- a/clang/test/InterfaceStubs/object.ifs +++ b/clang/test/InterfaceStubs/object.ifs @@ -4,12 +4,12 @@ # RUN: %clang -emit-interface-stubs -o - %s | llvm-readelf --all | \ # RUN: FileCheck %s --check-prefixes=CHECK-ELF -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-NEXT: b: { Type: Object, Size: 4 } +# CHECK-IFS-NEXT: - { Name: b, Type: Object, Size: 4 } # CHECK-IFS-NEXT: ... # CHECK-ELF: ELF Header: @@ -19,10 +19,10 @@ # CHECK-ELF-NOT: FUNC GLOBAL DEFAULT 1 a # CHECK-ELF: OBJECT GLOBAL DEFAULT 1 b ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-linux-gnu ObjectFileFormat: ELF Symbols: - b: { Type: Object, Size: 4 } + - { Name: b, Type: Object, Size: 4 } ... 
diff --git a/clang/test/InterfaceStubs/ppc.cpp b/clang/test/InterfaceStubs/ppc.cpp index 9a91697d9506..8b7a276bb054 100644 --- a/clang/test/InterfaceStubs/ppc.cpp +++ b/clang/test/InterfaceStubs/ppc.cpp @@ -4,11 +4,11 @@ // RUN: -emit-interface-stubs -emit-merged-ifs -S | \ // RUN: FileCheck -check-prefix=CHECK-IFS %s - // CHECK-IFS: --- !experimental-ifs-v1 - // CHECK-IFS: IfsVersion: 1.0 - // CHECK-IFS: Triple: powerpc64le - // CHECK-IFS: Symbols: - // CHECK-IFS: _Z8helloPPCv: { Type: Func } - // CHECK-IFS: ... +// CHECK-IFS: --- !experimental-ifs-v2 +// CHECK-IFS: IfsVersion: 2.0 +// CHECK-IFS: Triple: powerpc64le +// CHECK-IFS: Symbols: +// CHECK-IFS: - { Name: _Z8helloPPCv, Type: Func } +// CHECK-IFS: ... int helloPPC(); diff --git a/clang/test/InterfaceStubs/template-constexpr.cpp b/clang/test/InterfaceStubs/template-constexpr.cpp index c4c7afa42f1e..f59a55b2bb45 100644 --- a/clang/test/InterfaceStubs/template-constexpr.cpp +++ b/clang/test/InterfaceStubs/template-constexpr.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/template-namespace-function.cpp b/clang/test/InterfaceStubs/template-namespace-function.cpp index 47788d4a3e0a..68f017c4d5ec 100644 --- a/clang/test/InterfaceStubs/template-namespace-function.cpp +++ b/clang/test/InterfaceStubs/template-namespace-function.cpp @@ -6,10 +6,10 @@ // RUN: FileCheck -check-prefix=CHECK-SYMBOLS %s // CHECK: Symbols: -// CHECK-DAG: "_ZN3qux3barEii" : { Type: Func } -// CHECK-DAG: "_ZN3baz3addIiEET_S1_S1_" : { Type: Func } -// CHECK-DAG: "_Z4fbarff" : { Type: Func } -// CHECK-DAG: "_ZN3baz3addIfEET_S1_S1_" : { Type: Func } +// CHECK-DAG: - { Name: "_ZN3qux3barEii", Type: Func } +// CHECK-DAG: - { Name: "_ZN3baz3addIiEET_S1_S1_", Type: Func } +// CHECK-DAG: - { Name: "_Z4fbarff", Type: Func } +// CHECK-DAG: - { Name: "_ZN3baz3addIfEET_S1_S1_", Type: Func } // Same symbols just different order. // CHECK-SYMBOLS-DAG: _Z4fbarff diff --git a/clang/test/InterfaceStubs/template-template-parm-decl.cpp b/clang/test/InterfaceStubs/template-template-parm-decl.cpp index 63883536a816..5451ec6178e2 100644 --- a/clang/test/InterfaceStubs/template-template-parm-decl.cpp +++ b/clang/test/InterfaceStubs/template-template-parm-decl.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/trycatch.cpp b/clang/test/InterfaceStubs/trycatch.cpp index 57076a097cb5..dac7806926a5 100644 --- a/clang/test/InterfaceStubs/trycatch.cpp +++ b/clang/test/InterfaceStubs/trycatch.cpp @@ -2,13 +2,12 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -o - -emit-interface-stubs %s | FileCheck %s - -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: x86_64-unknown-linux-gnu // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: -// CHECK-NEXT: "_Z1fv" : { Type: Func } +// CHECK-NEXT: - { Name: "_Z1fv", Type: Func } // CHECK-NEXT: ... 
class C5 {}; diff --git a/clang/test/InterfaceStubs/unresolved-using-typename.cpp b/clang/test/InterfaceStubs/unresolved-using-typename.cpp index e6afc781412a..d4aad84d7211 100644 --- a/clang/test/InterfaceStubs/unresolved-using-typename.cpp +++ b/clang/test/InterfaceStubs/unresolved-using-typename.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/usings.cpp b/clang/test/InterfaceStubs/usings.cpp index 735a040c91dc..2ef83207fcb3 100644 --- a/clang/test/InterfaceStubs/usings.cpp +++ b/clang/test/InterfaceStubs/usings.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: diff --git a/clang/test/InterfaceStubs/var-template-specialization-decl.cpp b/clang/test/InterfaceStubs/var-template-specialization-decl.cpp index bbb5ae888977..9b67dac9865c 100644 --- a/clang/test/InterfaceStubs/var-template-specialization-decl.cpp +++ b/clang/test/InterfaceStubs/var-template-specialization-decl.cpp @@ -1,12 +1,12 @@ // REQUIRES: x86-registered-target // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -o - -emit-interface-stubs %s | FileCheck %s -// CHECK: --- !experimental-ifs-v1 -// CHECK-NEXT: IfsVersion: 1.0 +// CHECK: --- !experimental-ifs-v2 +// CHECK-NEXT: IfsVersion: 2.0 // CHECK-NEXT: Triple: x86_64-unknown-linux-gnu // CHECK-NEXT: ObjectFileFormat: ELF // CHECK-NEXT: Symbols: -// CHECK-NEXT: "a" : { Type: Object, Size: 4 } +// CHECK-NEXT: - { Name: "a", Type: Object, Size: 4 } // CHECK-NEXT: ... template struct S9 { diff --git a/clang/test/InterfaceStubs/weak.cpp b/clang/test/InterfaceStubs/weak.cpp index 1581ffa9d5d7..e3c0413b6511 100644 --- a/clang/test/InterfaceStubs/weak.cpp +++ b/clang/test/InterfaceStubs/weak.cpp @@ -1,14 +1,14 @@ // REQUIRES: x86-registered-target // RUN: %clang_cc1 -triple x86_64-linux-gnu -o - -emit-interface-stubs \ -// RUN: -interface-stub-version=experimental-ifs-v1 %s | \ +// RUN: -interface-stub-version=experimental-ifs-v2 %s | \ // RUN: FileCheck %s // RUN: %clang -target x86_64-unknown-linux-gnu -o - -c %s | llvm-nm - 2>&1 | \ // RUN: FileCheck -check-prefix=CHECK-SYMBOLS %s // CHECK: Symbols: -// CHECK-DAG: "_Z8weakFuncv" : { Type: Func, Weak: true } -// CHECK-DAG: "_Z10strongFuncv" : { Type: Func } +// CHECK-DAG: - { Name: "_Z8weakFuncv", Type: Func, Weak: true } +// CHECK-DAG: - { Name: "_Z10strongFuncv", Type: Func } // CHECK-SYMBOLS-DAG: _Z10strongFuncv // CHECK-SYMBOLS-DAG: _Z8weakFuncv diff --git a/clang/test/InterfaceStubs/windows.cpp b/clang/test/InterfaceStubs/windows.cpp index c81c702861e4..73f3ed5d39ac 100644 --- a/clang/test/InterfaceStubs/windows.cpp +++ b/clang/test/InterfaceStubs/windows.cpp @@ -6,11 +6,11 @@ // CHECK-CC1: Symbols: // CHECK-CC1-NEXT: ?helloWindowsMsvc@@YAHXZ - // CHECK-IFS: --- !experimental-ifs-v1 - // CHECK-IFS: IfsVersion: 1.0 - // CHECK-IFS: Triple: - // CHECK-IFS: Symbols: - // CHECK-IFS: ?helloWindowsMsvc@@YAHXZ: { Type: Func } - // CHECK-IFS: ... 
+// CHECK-IFS: --- !experimental-ifs-v2 +// CHECK-IFS: IfsVersion: 2.0 +// CHECK-IFS: Triple: +// CHECK-IFS: Symbols: +// CHECK-IFS: - { Name: '?helloWindowsMsvc@@YAHXZ', Type: Func } +// CHECK-IFS: ... int helloWindowsMsvc(); diff --git a/clang/test/Misc/diag-line-wrapping.cpp b/clang/test/Misc/diag-line-wrapping.cpp index 2bcb03f9781c..9e8cb9b53da5 100644 --- a/clang/test/Misc/diag-line-wrapping.cpp +++ b/clang/test/Misc/diag-line-wrapping.cpp @@ -1,5 +1,5 @@ -// RUN: not %clang_cc1 -fsyntax-only -fmessage-length 60 %s 2>&1 | FileCheck %s -// RUN: not %clang_cc1 -fsyntax-only -fmessage-length 0 %s 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -fsyntax-only -fmessage-length=60 %s 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -fsyntax-only -fmessage-length=0 %s 2>&1 | FileCheck %s struct B { void f(); }; struct D1 : B {}; diff --git a/clang/test/Misc/message-length.c b/clang/test/Misc/message-length.c index a6f4f44e6b9c..1e0b4edb7c03 100644 --- a/clang/test/Misc/message-length.c +++ b/clang/test/Misc/message-length.c @@ -1,6 +1,6 @@ -// RUN: not %clang_cc1 -fmessage-length 72 %s 2>&1 | FileCheck -strict-whitespace %s -// RUN: not %clang_cc1 -fmessage-length 1 %s -// RUN: not %clang_cc1 -fmessage-length 8 %s 2>&1 | FileCheck -check-prefix=CHECK-DOT %s +// RUN: not %clang_cc1 -fmessage-length=72 %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 -fmessage-length=1 %s +// RUN: not %clang_cc1 -fmessage-length=8 %s 2>&1 | FileCheck -check-prefix=CHECK-DOT %s // Hack so we can check things better, force the file name and line. # 1 "FILE" 1 diff --git a/clang/test/Misc/show-diag-options.c b/clang/test/Misc/show-diag-options.c index 8f05fbc76b56..4e98d63195f1 100644 --- a/clang/test/Misc/show-diag-options.c +++ b/clang/test/Misc/show-diag-options.c @@ -1,16 +1,16 @@ -// RUN: %clang_cc1 -fsyntax-only %s 2>&1 \ +// RUN: %clang_cc1 -fsyntax-only -fno-diagnostics-show-option %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=BASE -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option %s 2>&1 \ +// RUN: %clang_cc1 -fsyntax-only %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=OPTION -// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-show-option -Werror %s 2>&1 \ +// RUN: not %clang_cc1 -fsyntax-only -Werror %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=OPTION_ERROR -// RUN: %clang_cc1 -fsyntax-only -std=c89 -pedantic -fdiagnostics-show-option %s 2>&1 \ +// RUN: %clang_cc1 -fsyntax-only -std=c89 -pedantic %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=OPTION_PEDANTIC -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-category id %s 2>&1 \ +// RUN: %clang_cc1 -fsyntax-only -fno-diagnostics-show-option -fdiagnostics-show-category id %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=CATEGORY_ID -// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-category name %s 2>&1 \ +// RUN: %clang_cc1 -fsyntax-only -fno-diagnostics-show-option -fdiagnostics-show-category name %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=CATEGORY_NAME -// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-show-option -fdiagnostics-show-category name -Werror %s 2>&1 \ +// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-show-category name -Werror %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=OPTION_ERROR_CATEGORY void test(int x, int y) { diff --git a/clang/test/Misc/unnecessary-elipses.cpp b/clang/test/Misc/unnecessary-elipses.cpp index 2ee725869b5c..c8c178c37f6c 100644 --- a/clang/test/Misc/unnecessary-elipses.cpp +++ b/clang/test/Misc/unnecessary-elipses.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fmessage-length 
80 %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: %clang_cc1 -fsyntax-only -fmessage-length=80 %s 2>&1 | FileCheck -strict-whitespace %s int main() { "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; @@ -12,4 +12,4 @@ int main() { "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" ; // CHECK: {{^ ..."xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"...}} -} \ No newline at end of file +} diff --git a/clang/test/Misc/unprintable.c b/clang/test/Misc/unprintable.c index eaa4f34d8028..30e449456630 100644 --- a/clang/test/Misc/unprintable.c +++ b/clang/test/Misc/unprintable.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 %s -fmessage-length 40 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 %s -fmessage-length=40 2>&1 | FileCheck -strict-whitespace %s int main() { int i; diff --git a/clang/test/Misc/wrong-encoding2.c b/clang/test/Misc/wrong-encoding2.c index 43a0f4e900ed..b60ed7f92b86 100644 --- a/clang/test/Misc/wrong-encoding2.c +++ b/clang/test/Misc/wrong-encoding2.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fmessage-length 100 %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: %clang_cc1 -fsyntax-only -fmessage-length=100 %s 2>&1 | FileCheck -strict-whitespace %s // REQUIRES: asserts int main() { diff --git a/clang/test/OpenMP/depobj_codegen.cpp b/clang/test/OpenMP/depobj_codegen.cpp index 2c7509babc17..e51c607ac55a 100644 --- a/clang/test/OpenMP/depobj_codegen.cpp +++ b/clang/test/OpenMP/depobj_codegen.cpp @@ -21,7 +21,7 @@ void foo() {} template T tmain(T argc) { static T a; - void *argv; + int *argv; #pragma omp depobj(a) depend(in:argv, ([3][*(int*)argv][4])argv) #pragma omp depobj(argc) destroy #pragma omp depobj(argc) update(inout) @@ -99,12 +99,12 @@ int main(int argc, char **argv) { // CHECK: store i64 8, i64* [[SZ_ADDR]], // CHECK: [[FLAGS_ADDR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[BASE_ADDR]], i{{.+}} 0, i{{.+}} 2 // CHECK: store i8 1, i8* [[FLAGS_ADDR]], -// CHECK: [[SHAPE_ADDR:%.+]] = load i8*, i8** [[ARGV_ADDR:%.+]], -// CHECK: [[SZ1:%.+]] = mul nuw i64 3, %{{.+}} +// CHECK: [[SHAPE_ADDR:%.+]] = load i32*, i32** [[ARGV_ADDR:%.+]], +// CHECK: [[SZ1:%.+]] = mul nuw i64 12, %{{.+}} // CHECK: [[SZ:%.+]] = mul nuw i64 [[SZ1]], 4 // CHECK: [[BASE_ADDR:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[DEP_ADDR]], i{{.+}} 0, i{{.+}} 2 // CHECK: [[ADDR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[BASE_ADDR]], i{{.+}} 0, i{{.+}} 0 -// CHECK: [[SHAPE:%.+]] = ptrtoint i8* [[SHAPE_ADDR]] to i64 +// CHECK: [[SHAPE:%.+]] = ptrtoint i32* [[SHAPE_ADDR]] to i64 // CHECK: store i64 [[SHAPE]], i64* [[ADDR]], // CHECK: [[SZ_ADDR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[BASE_ADDR]], i{{.+}} 0, i{{.+}} 1 // CHECK: store i64 [[SZ]], i64* [[SZ_ADDR]], diff --git a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp index faff77e0a43b..c71615d2521f 100644 --- a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp +++ b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp @@ -1,5 +1,10 @@ -// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fexceptions -fcxx-exceptions -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda 
-emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fexceptions -fcxx-exceptions -ferror-limit 100 +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown \ +// RUN: -verify=host -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc \ +// RUN: %s -o %t-ppc-host.bc -fexceptions -fcxx-exceptions +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s \ +// RUN: -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - \ +// RUN: -fexceptions -fcxx-exceptions -ferror-limit 100 #ifndef HEADER #define HEADER @@ -81,4 +86,17 @@ int (*B)() = &foobar2; int foobar1() { throw 1; } int foobar2() { throw 1; } // expected-error {{cannot use 'throw' with exceptions disabled}} + +int foobar3(); +int (*C)() = &foobar3; // expected-warning {{declaration is not declared in any declare target region}} + // host-warning@-1 {{declaration is not declared in any declare target region}} +#pragma omp declare target +int (*D)() = C; // expected-note {{used here}} + // host-note@-1 {{used here}} +#pragma omp end declare target +int foobar3() { throw 1; } + +// Check no infinite recursion in deferred diagnostic emitter. +long E = (long)&E; + #endif // HEADER diff --git a/clang/test/OpenMP/target_data_ast_print.cpp b/clang/test/OpenMP/target_data_ast_print.cpp index fa67c1834aa4..fcd6e928655c 100644 --- a/clang/test/OpenMP/target_data_ast_print.cpp +++ b/clang/test/OpenMP/target_data_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -139,6 +139,8 @@ int main (int argc, char **argv) { static int a; // CHECK: static int a; +#pragma omp target data map(to: ([argc][3][a])argv) + // CHECK: #pragma omp target data map(to: ([argc][3][a])argv) #pragma omp target data map(to: c) // CHECK: #pragma omp target data map(to: c) a=2; diff --git a/clang/test/OpenMP/target_map_codegen.cpp b/clang/test/OpenMP/target_map_codegen.cpp index b9766e82ce03..ecfe50c01ea6 100644 --- a/clang/test/OpenMP/target_map_codegen.cpp +++ b/clang/test/OpenMP/target_map_codegen.cpp @@ -5353,5 +5353,81 @@ void explicit_maps_single (int ii){ // CK31: define {{.+}}[[CALL00]] // CK31: define {{.+}}[[CALL01]] +#endif +///==========================================================================/// +// 
RUN: %clang_cc1 -DCK32 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK32 --check-prefix CK32-64 +// RUN: %clang_cc1 -DCK32 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK32 --check-prefix CK32-64 +// RUN: %clang_cc1 -DCK32 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK32 --check-prefix CK32-32 +// RUN: %clang_cc1 -DCK32 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK32 --check-prefix CK32-32 + +// RUN: %clang_cc1 -DCK32 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY32 %s +// RUN: %clang_cc1 -DCK32 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY32 %s +// RUN: %clang_cc1 -DCK32 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY32 %s +// RUN: %clang_cc1 -DCK32 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY32 %s +// SIMD-ONLY32-NOT: {{__kmpc|__tgt}} +#ifdef CK32 + +// CK32-DAG: [[MTYPE_TO:@.+]] = {{.+}}constant [1 x i64] [i64 33] +// CK32-DAG: [[MTYPE_FROM:@.+]] = {{.+}}constant [1 x i64] [i64 34] + +void array_shaping(float *f, int sa) { + + // CK32-DAG: call i32 @__tgt_target(i64 -1, i8* @{{.+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_TO]]{{.+}}) + // CK32-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK32-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK32-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK32-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK32-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK32-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + + // CK32-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** + // CK32-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** + + // CK32-DAG: store float* [[F1:%.+]], float** [[BPC0]], + // CK32-DAG: store float* 
[[F2:%.+]], float** [[PC0]], + // CK32-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], + + // CK32-DAG: [[F1]] = load float*, float** [[F_ADDR:%.+]], + // CK32-DAG: [[F2]] = load float*, float** [[F_ADDR]], + // CK32-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 4 + // CK32-64-DAG: [[SZ1]] = mul nuw i64 12, %{{.+}} + // CK32-32-DAG: [[SIZE]] = sext i32 [[SZ1:%.+]] to i64 + // CK32-32-DAG: [[SZ1]] = mul nuw i32 [[SZ2:%.+]], 4 + // CK32-32-DAG: [[SZ2]] = mul nuw i32 12, %{{.+}} + #pragma omp target map(to:([3][sa][4])f) + f[0] = 1; + sa = 1; + // CK32-DAG: call i32 @__tgt_target(i64 -1, i8* @{{.+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_FROM]]{{.+}}) + // CK32-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK32-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK32-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK32-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK32-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK32-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + + // CK32-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** + // CK32-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** + + // CK32-DAG: store float* [[F1:%.+]], float** [[BPC0]], + // CK32-DAG: store float* [[F2:%.+]], float** [[PC0]], + // CK32-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], + + // CK32-DAG: [[F1]] = load float*, float** [[F_ADDR:%.+]], + // CK32-DAG: [[F2]] = load float*, float** [[F_ADDR]], + // CK32-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 5 + // CK32-64-DAG: [[SZ1]] = mul nuw i64 4, %{{.+}} + // CK32-32-DAG: [[SIZE]] = sext i32 [[SZ1:%.+]] to i64 + // CK32-32-DAG: [[SZ1]] = mul nuw i32 [[SZ2:%.+]], 5 + // CK32-32-DAG: [[SZ2]] = mul nuw i32 4, %{{.+}} + #pragma omp target map(from: ([sa][5])f) + f[0] = 1; +} + #endif #endif diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp index 96932af6a04c..a18590fc85fe 100644 --- a/clang/test/OpenMP/target_map_messages.cpp +++ b/clang/test/OpenMP/target_map_messages.cpp @@ -140,6 +140,8 @@ struct SA { {} #pragma omp target map(close bf: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} {} + #pragma omp target map(([b[I]][bf])f) // le45-error {{expected ',' or ']' in lambda capture list}} le45-error {{expected ')'}} le45-note {{to match this '('}} + {} return; } }; @@ -189,203 +191,209 @@ void SAclient(int arg) { SD u; SC r(p),t(p); - #pragma omp target map(r) +#pragma omp target map(r) {} - #pragma omp target map(marr[2][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[2] [0:2] [0:2]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:] [0:2] [0:2]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[2][3][0:2]) +#pragma omp target map(marr[2][3] [0:2]) {} - #pragma omp target map(marr[:][:][:]) +#pragma omp target map(marr[:][:][:]) {} - #pragma omp target map(marr[:2][:][:]) +#pragma omp target map(marr[:2][:][:]) {} - #pragma omp target map(marr[arg:][:][:]) +#pragma omp target map(marr [arg:][:][:]) {} - #pragma omp target map(marr[arg:]) +#pragma omp target 
map(marr [arg:]) {} - #pragma omp target map(marr[arg:][:arg][:]) // correct if arg is the size of dimension 2 +#pragma omp target map(marr [arg:][:arg][:]) // correct if arg is the size of dimension 2 {} - #pragma omp target map(marr[:arg][:]) +#pragma omp target map(marr[:arg][:]) {} - #pragma omp target map(marr[:arg][n:]) +#pragma omp target map(marr[:arg] [n:]) {} - #pragma omp target map(marr[:][:arg][n:]) // correct if arg is the size of dimension 2 +#pragma omp target map(marr[:][:arg] [n:]) // correct if arg is the size of dimension 2 {} - #pragma omp target map(marr[:][:m][n:]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:][:m] [n:]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[n:m][:arg][n:]) +#pragma omp target map(marr [n:m][:arg] [n:]) {} - #pragma omp target map(marr[:2][:1][:]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:2][:1][:]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:2][1:][:]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:2] [1:][:]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:2][:][:1]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:2][:][:1]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:2][:][1:]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:2][:] [1:]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:1][:2][:]) +#pragma omp target map(marr[:1][:2][:]) {} - #pragma omp target map(marr[:1][0][:]) +#pragma omp target map(marr[:1][0][:]) {} - #pragma omp target map(marr[:arg][:2][:]) // correct if arg is 1 +#pragma omp target map(marr[:arg][:2][:]) // correct if arg is 1 {} - #pragma omp target map(marr[:1][3:1][:2]) +#pragma omp target map(marr[:1] [3:1][:2]) {} - #pragma omp target map(marr[:1][3:arg][:2]) // correct if arg is 1 +#pragma omp target map(marr[:1] [3:arg][:2]) // correct if arg is 1 {} - #pragma omp target map(marr[:1][3:2][:2]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:1] [3:2][:2]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr[:2][:10][:]) +#pragma omp target map(marr[:2][:10][:]) {} - #pragma omp target map(marr[:2][:][:5+5]) +#pragma omp target map(marr[:2][:][:5 + 5]) {} - #pragma omp target map(marr[:2][2+2-4:][0:5+5]) +#pragma omp target map(marr[:2] [2 + 2 - 4:] [0:5 + 5]) {} - #pragma omp target map(marr[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(marr[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(marr2[:1][:2][0]) +#pragma omp target map(marr2[:1][:2][0]) {} - #pragma omp target map(mvla[:1][:][0]) // correct if the size of dimension 2 is 1. +#pragma omp target map(mvla[:1][:][0]) // correct if the size of dimension 2 is 1. {} - #pragma omp target map(mvla[:2][:arg][:]) // correct if arg is the size of dimension 2. +#pragma omp target map(mvla[:2][:arg][:]) // correct if arg is the size of dimension 2. 
{} - #pragma omp target map(mvla[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(mvla[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(mvla[1][2:arg][:]) +#pragma omp target map(mvla[1] [2:arg][:]) {} - #pragma omp target map(mvla[:1][:][:]) +#pragma omp target map(mvla[:1][:][:]) {} - #pragma omp target map(mvla2[:1][:2][:11]) +#pragma omp target map(mvla2[:1][:2][:11]) {} - #pragma omp target map(mvla2[:1][:2][:10]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(mvla2[:1][:2][:10]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(mptr[:2][2+2-4:1][0:5+5]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(mptr[:2] [2 + 2 - 4:1] [0:5 + 5]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(mptr[:1][:2-1][2:4-3]) +#pragma omp target map(mptr[:1][:2 - 1] [2:4 - 3]) {} - #pragma omp target map(mptr[:1][:arg][2:4-3]) // correct if arg is 1. +#pragma omp target map(mptr[:1][:arg] [2:4 - 3]) // correct if arg is 1. {} - #pragma omp target map(mptr[:1][:2-1][0:2]) +#pragma omp target map(mptr[:1][:2 - 1] [0:2]) {} - #pragma omp target map(mptr[:1][:2][0:2]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(mptr[:1][:2] [0:2]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(mptr[:1][:][0:2]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} +#pragma omp target map(mptr[:1][:] [0:2]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} {} - #pragma omp target map(mptr[:2][:1][0:2]) // expected-error {{array section does not specify contiguous storage}} +#pragma omp target map(mptr[:2][:1] [0:2]) // expected-error {{array section does not specify contiguous storage}} {} - #pragma omp target map(r.ArrS[0].B) +#pragma omp target map(r.ArrS[0].B) {} - #pragma omp target map(r.ArrS[:1].B) // expected-error {{OpenMP array section is not allowed here}} +#pragma omp target map(r.ArrS[:1].B) // expected-error {{OpenMP array section is not allowed here}} {} - #pragma omp target map(r.ArrS[:arg].B) // expected-error {{OpenMP array section is not allowed here}} +#pragma omp target map(r.ArrS[:arg].B) // expected-error {{OpenMP array section is not allowed here}} {} - #pragma omp target map(r.ArrS[0].Arr[1:23]) +#pragma omp target map(r.ArrS[0].Arr [1:23]) {} - #pragma omp target map(r.ArrS[0].Arr[1:arg]) +#pragma omp target map(r.ArrS[0].Arr [1:arg]) {} - #pragma omp target map(r.ArrS[0].Arr[arg:23]) +#pragma omp target map(r.ArrS[0].Arr [arg:23]) {} - #pragma omp target map(r.ArrS[0].Error) // expected-error {{no member named 'Error' in 'SB'}} +#pragma omp target map(r.ArrS[0].Error) // expected-error {{no member named 'Error' in 'SB'}} {} - #pragma omp target map(r.ArrS[0].A, r.ArrS[1].A) // expected-error {{multiple array elements associated with the same variable are not allowed in map clauses of the same construct}} expected-note {{used here}} +#pragma omp target map(r.ArrS[0].A, r.ArrS[1].A) // expected-error {{multiple array elements associated with the same variable are not allowed in map clauses of the same construct}} expected-note {{used here}} {} - #pragma omp 
target map(r.ArrS[0].A, t.ArrS[1].A) +#pragma omp target map(r.ArrS[0].A, t.ArrS[1].A) {} - #pragma omp target map(r.PtrS[0], r.PtrS->B) // expected-error {{same pointer dereferenced in multiple different ways in map clause expressions}} expected-note {{used here}} +#pragma omp target map(r.PtrS[0], r.PtrS->B) // expected-error {{same pointer dereferenced in multiple different ways in map clause expressions}} expected-note {{used here}} {} - #pragma omp target map(r.PtrS, r.PtrS->B) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}} +#pragma omp target map(r.PtrS, r.PtrS->B) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}} {} - #pragma omp target map(r.PtrS->A, r.PtrS->B) +#pragma omp target map(r.PtrS->A, r.PtrS->B) {} - #pragma omp target map(r.RPtrS[0], r.RPtrS->B) // expected-error {{same pointer dereferenced in multiple different ways in map clause expressions}} expected-note {{used here}} +#pragma omp target map(r.RPtrS[0], r.RPtrS->B) // expected-error {{same pointer dereferenced in multiple different ways in map clause expressions}} expected-note {{used here}} {} - #pragma omp target map(r.RPtrS, r.RPtrS->B) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}} +#pragma omp target map(r.RPtrS, r.RPtrS->B) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}} {} - #pragma omp target map(r.RPtrS->A, r.RPtrS->B) +#pragma omp target map(r.RPtrS->A, r.RPtrS->B) {} - #pragma omp target map(r.S.Arr[:12]) +#pragma omp target map(r.S.Arr[:12]) {} - #pragma omp target map(r.S.foo()[:12]) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le50-error {{expected addressable lvalue in 'map' clause}} +#pragma omp target map(r.S.foo() [:12]) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le50-error {{expected addressable lvalue in 'map' clause}} {} - #pragma omp target map(r.C, r.D) +#pragma omp target map(r.C, r.D) {} - #pragma omp target map(r.C, r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} +#pragma omp target map(r.C, r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} {} - #pragma omp target map(r.C) map(r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} +#pragma omp target map(r.C) map(r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} {} - #pragma omp target map(r.C, r.S) // this would be an error only caught at runtime - Sema would have to make sure there is not way for the missing data between fields to be mapped somewhere else. +#pragma omp target map(r.C, r.S) // this would be an error only caught at runtime - Sema would have to make sure there is no way for the missing data between fields to be mapped somewhere else.
{} - #pragma omp target map(r, r.S) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} +#pragma omp target map(r, r.S) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}} {} - #pragma omp target map(r.C, t.C) +#pragma omp target map(r.C, t.C) {} - #pragma omp target map(r.A) // expected-error {{bit fields cannot be used to specify storage in a 'map' clause}} +#pragma omp target map(r.A) // expected-error {{bit fields cannot be used to specify storage in a 'map' clause}} {} - #pragma omp target map(r.Arr) +#pragma omp target map(r.Arr) {} - #pragma omp target map(r.Arr[3:5]) +#pragma omp target map(r.Arr [3:5]) {} - #pragma omp target map(r.Ptr[3:5]) +#pragma omp target map(r.Ptr [3:5]) {} - #pragma omp target map(r.ArrS[3:5].A) // expected-error {{OpenMP array section is not allowed here}} +#pragma omp target map(r.ArrS [3:5].A) // expected-error {{OpenMP array section is not allowed here}} {} - #pragma omp target map(r.ArrS[3:5].Arr[6:7]) // expected-error {{OpenMP array section is not allowed here}} +#pragma omp target map(r.ArrS [3:5].Arr [6:7]) // expected-error {{OpenMP array section is not allowed here}} {} - #pragma omp target map(r.ArrS[3].Arr[6:7]) +#pragma omp target map(r.ArrS[3].Arr [6:7]) {} - #pragma omp target map(r.S.Arr[4:5]) +#pragma omp target map(r.S.Arr [4:5]) {} - #pragma omp target map(r.S.Ptr[4:5]) +#pragma omp target map(r.S.Ptr [4:5]) {} - #pragma omp target map(r.S.Ptr[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} +#pragma omp target map(r.S.Ptr[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} {} - #pragma omp target map((p+1)->A) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} +#pragma omp target map((p + 1)->A) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} {} - #pragma omp target map(u.B) // expected-error {{mapping of union members is not allowed}} +#pragma omp target map(u.B) // expected-error {{mapping of union members is not allowed}} {} - #pragma omp target +#pragma omp target { u.B = 0; r.S.foo(); } - #pragma omp target data map(to: r.C) //expected-note {{used here}} +#pragma omp target data map(to \ + : r.C) //expected-note {{used here}} { - #pragma omp target map(r.D) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}} +#pragma omp target map(r.D) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}} {} } - #pragma omp target data map(to: t.Ptr) //expected-note {{used here}} +#pragma omp target data map(to \ + : t.Ptr) //expected-note {{used here}} { - #pragma omp target map(t.Ptr[:23]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} +#pragma omp target map(t.Ptr[:23]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} {} } - #pragma omp target data map(to: t.C, t.D) +#pragma omp target data map(to \ + : t.C, t.D) { - #pragma omp target data map(to: t.C) +#pragma omp target data map(to \ + : t.C) { - #pragma omp target map(t.D) +#pragma omp target map(t.D) {} } } - #pragma omp target data 
map(marr[:][:][:]) +#pragma omp target data map(marr[:][:][:]) { - #pragma omp target data map(marr) +#pragma omp target data map(marr) {} } - #pragma omp target data map(to: t) +#pragma omp target data map(to \ + : t) { - #pragma omp target data map(to: t.C) +#pragma omp target data map(to \ + : t.C) { - #pragma omp target map(t.D) +#pragma omp target map(t.D) {} } } diff --git a/clang/test/OpenMP/target_update_ast_print.cpp b/clang/test/OpenMP/target_update_ast_print.cpp index e60e081b3210..fb6440b87cea 100644 --- a/clang/test/OpenMP/target_update_ast_print.cpp +++ b/clang/test/OpenMP/target_update_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -14,29 +14,29 @@ void foo() {} template <typename T, typename U> T foo(T targ, U uarg) { - static T a; + static T a, *p; U b; int l; -#pragma omp target update to(a) if(l>5) device(l) nowait depend(inout:l) +#pragma omp target update to(([a][targ])p, a) if(l>5) device(l) nowait depend(inout:l) -#pragma omp target update from(b) if(l<5) device(l-1) nowait depend(inout:l) +#pragma omp target update from(b, ([a][targ])p) if(l<5) device(l-1) nowait depend(inout:l) return a + targ + (T)b; } -// CHECK: static T a; +// CHECK: static T a, *p; // CHECK-NEXT: U b; // CHECK-NEXT: int l; -// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l){{$}} -// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l) -// CHECK: static int a; +// CHECK-NEXT: #pragma omp target update to(([a][targ])p,a) if(l > 5) device(l) nowait depend(inout : l){{$}} +// CHECK-NEXT: #pragma omp target update from(b,([a][targ])p) if(l < 5) device(l - 1) nowait depend(inout : l) +// CHECK: static int a, *p; // CHECK-NEXT: float b; // CHECK-NEXT: int l; -// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l) -// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l) -// CHECK: static char a; +// CHECK-NEXT: #pragma omp target update to(([a][targ])p,a) if(l > 5) device(l) nowait depend(inout : l) +// CHECK-NEXT: #pragma omp target update from(b,([a][targ])p) if(l < 5) device(l - 1) nowait depend(inout : l) +// CHECK: static char a, *p; // CHECK-NEXT: float b; // CHECK-NEXT: int l; -// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l) -// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait
depend(inout : l) +// CHECK-NEXT: #pragma omp target update to(([a][targ])p,a) if(l > 5) device(l) nowait depend(inout : l) +// CHECK-NEXT: #pragma omp target update from(b,([a][targ])p) if(l < 5) device(l - 1) nowait depend(inout : l) int main(int argc, char **argv) { static int a; diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index 479461e7ca80..fd5a62a8067c 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -984,5 +984,80 @@ void lvalue_find_base(float **f, SSA *sa) { #pragma omp target update from(*(sa->sa->i+*(1+sa->i+f))) } +#endif +///==========================================================================/// +// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64 +// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64 +// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-32 +// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-32 + +// RUN: %clang_cc1 -DCK18 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -DCK18 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY18 %s +// SIMD-ONLY18-NOT: {{__kmpc|__tgt}} +#ifdef CK18 + +// CK18-DAG: [[MTYPE_TO:@.+]] = {{.+}}constant [1 x i64] [i64 33] +// CK18-DAG: [[MTYPE_FROM:@.+]] = {{.+}}constant [1 x i64] [i64 34] + +//CK18-LABEL: array_shaping +void array_shaping(float *f, int sa) { + + // CK18-DAG: call void @__tgt_target_data_update(i64 -1, i32 1, i8** [[GEPBP:%.+]], 
i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_TO]]{{.+}}) + // CK18-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK18-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK18-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK18-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK18-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK18-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + + // CK18-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** + // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** + + // CK18-DAG: store float* [[F1:%.+]], float** [[BPC0]], + // CK18-DAG: store float* [[F2:%.+]], float** [[PC0]], + // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], + + // CK18-DAG: [[F1]] = load float*, float** [[F_ADDR:%.+]], + // CK18-DAG: [[F2]] = load float*, float** [[F_ADDR]], + // CK18-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 4 + // CK18-64-DAG: [[SZ1]] = mul nuw i64 12, %{{.+}} + // CK18-32-DAG: [[SIZE]] = sext i32 [[SZ1:%.+]] to i64 + // CK18-32-DAG: [[SZ1]] = mul nuw i32 [[SZ2:%.+]], 4 + // CK18-32-DAG: [[SZ2]] = mul nuw i32 12, %{{.+}} + #pragma omp target update to(([3][sa][4])f) + sa = 1; + // CK18-DAG: call void @__tgt_target_data_update(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_FROM]]{{.+}}) + // CK18-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK18-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK18-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK18-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK18-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK18-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + + // CK18-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** + // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** + + // CK18-DAG: store float* [[F1:%.+]], float** [[BPC0]], + // CK18-DAG: store float* [[F2:%.+]], float** [[PC0]], + // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], + + // CK18-DAG: [[F1]] = load float*, float** [[F_ADDR:%.+]], + // CK18-DAG: [[F2]] = load float*, float** [[F_ADDR]], + // CK18-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 5 + // CK18-64-DAG: [[SZ1]] = mul nuw i64 4, %{{.+}} + // CK18-32-DAG: [[SIZE]] = sext i32 [[SZ1:%.+]] to i64 + // CK18-32-DAG: [[SZ1]] = mul nuw i32 [[SZ2:%.+]], 5 + // CK18-32-DAG: [[SZ2]] = mul nuw i32 4, %{{.+}} + #pragma omp target update from(([sa][5])f) +} + #endif #endif diff --git a/clang/test/OpenMP/task_ast_print.cpp b/clang/test/OpenMP/task_ast_print.cpp index 0f11b390f7fa..1da6c5045934 100644 --- a/clang/test/OpenMP/task_ast_print.cpp +++ b/clang/test/OpenMP/task_ast_print.cpp @@ -164,8 +164,8 @@ int main(int argc, char **argv) { #pragma omp threadprivate(a) Enum ee; // CHECK: Enum ee; -#pragma omp task untied mergeable depend(out:argv[:a][1], (arr)[0:],([argc][10])argv) if(task: argc > 0) priority(f) depend(depobj:y) - // CHECK-NEXT: #pragma omp task untied mergeable depend(out : argv[:a][1],(arr)[0:],([argc][10])argv) if(task: argc > 0) priority(f) depend(depobj : y) +#pragma omp task untied mergeable depend(out:argv[:a][1], (arr)[0:],([argc][10])argv,b) if(task: argc > 0) priority(f) depend(depobj:y) + // CHECK-NEXT: #pragma omp task untied mergeable depend(out : 
argv[:a][1],(arr)[0:],([argc][10])argv,b) if(task: argc > 0) priority(f) depend(depobj : y) a = 2; // CHECK-NEXT: a = 2; #pragma omp taskgroup task_reduction(min: arr1) diff --git a/clang/test/OpenMP/task_codegen.c b/clang/test/OpenMP/task_codegen.c index 9e4b3b59d6d5..0f01f11be8b3 100644 --- a/clang/test/OpenMP/task_codegen.c +++ b/clang/test/OpenMP/task_codegen.c @@ -58,7 +58,7 @@ int main() { // CHECK: store i8 1, i8* [[FLAGS_ADDR]], // CHECK: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK: [[A_CAST:%.+]] = sext i32 [[A]] to i64 - // CHECK: [[SZ1:%.+]] = mul nuw i64 3, [[A_CAST]] + // CHECK: [[SZ1:%.+]] = mul nuw i64 24, [[A_CAST]] // CHECK: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK: [[A_CAST:%.+]] = sext i32 [[A]] to i64 // CHECK: [[SZ:%.+]] = mul nuw i64 [[SZ1]], [[A_CAST]] diff --git a/clang/test/OpenMP/task_depend_messages.cpp b/clang/test/OpenMP/task_depend_messages.cpp index 7d976eca2ec1..f04c167cbdcc 100644 --- a/clang/test/OpenMP/task_depend_messages.cpp +++ b/clang/test/OpenMP/task_depend_messages.cpp @@ -67,8 +67,8 @@ int main(int argc, char **argv, char *env[]) { #pragma omp task depend(in : ([]) // omp45-error {{expected body of lambda expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp50-error 2 {{expected expression}} #pragma omp task depend(in : ([])a // omp45-error {{expected body of lambda expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp50-error {{expected expression}} #pragma omp task depend(in : ([])a) // omp45-error {{expected body of lambda expression}} omp50-error {{expected expression}} - #pragma omp task depend(in : ([a])a) // omp45-error {{expected body of lambda expression}} omp50-error {{expected pointer type expression as a base of an array shaping operation}} - #pragma omp task depend(in : ([a])argc) // omp45-error {{expected body of lambda expression}} omp50-error {{expected pointer type expression as a base of an array shaping operation}} + #pragma omp task depend(in : ([a])a) // omp45-error {{expected body of lambda expression}} omp50-error {{expected expression with a pointer to a complete type as a base of an array shaping operation}} + #pragma omp task depend(in : ([a])argc) // omp45-error {{expected body of lambda expression}} omp50-error {{expected expression with a pointer to a complete type as a base of an array shaping operation}} #pragma omp task depend(in : ([-1][0])argv) // omp45-error {{expected variable name or 'this' in lambda capture list}} omp45-error {{expected ')'}} omp45-note {{to match this '('}} omp50-error {{array shaping dimension is evaluated to a non-positive value -1}} omp50-error {{array shaping dimension is evaluated to a non-positive value 0}} foo(); diff --git a/clang/test/Sema/parentheses.c b/clang/test/Sema/parentheses.c index 047bcbfe6caf..164fe4c0f12d 100644 --- a/clang/test/Sema/parentheses.c +++ b/clang/test/Sema/parentheses.c @@ -117,5 +117,5 @@ void conditional_op(int x, int y, _Bool b, void* p) { (void)(x && b ? 
1 : 2); // no warning, logical operator } -// RUN: not %clang_cc1 -fsyntax-only -Wparentheses -Werror -fdiagnostics-show-option %s 2>&1 | FileCheck %s -check-prefix=CHECK-FLAG +// RUN: not %clang_cc1 -fsyntax-only -Wparentheses -Werror %s 2>&1 | FileCheck %s -check-prefix=CHECK-FLAG // CHECK-FLAG: error: using the result of an assignment as a condition without parentheses [-Werror,-Wparentheses] diff --git a/clang/test/SemaCXX/atomic-type.cpp b/clang/test/SemaCXX/atomic-type.cpp index 1ed321e47b9a..d7d8bbba50ca 100644 --- a/clang/test/SemaCXX/atomic-type.cpp +++ b/clang/test/SemaCXX/atomic-type.cpp @@ -103,3 +103,11 @@ namespace copy_init { bool PR21836(_Atomic(int) *x) { // expected-warning {{'_Atomic' is a C11 extension}} return *x; } + +namespace non_trivially_copyable { + struct S { + ~S() {} + }; + _Atomic S s; // expected-error {{_Atomic cannot be applied to type 'non_trivially_copyable::S' which is not trivially copyable}} \ + // expected-warning {{'_Atomic' is a C11 extension}} +} diff --git a/clang/unittests/Analysis/CloneDetectionTest.cpp b/clang/unittests/Analysis/CloneDetectionTest.cpp index e09d0733f044..f8f3602f5a2a 100644 --- a/clang/unittests/Analysis/CloneDetectionTest.cpp +++ b/clang/unittests/Analysis/CloneDetectionTest.cpp @@ -42,7 +42,7 @@ class NoBarFunctionConstraint { for (const StmtSequence &Arg : {A, B}) { if (const auto *D = dyn_cast<FunctionDecl>(Arg.getContainingDecl())) { - if (StringRef(D->getNameAsString()).startswith("bar")) + if (D->getName().startswith("bar")) return false; } } diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c index 498c05900bf2..5ff1e9cd8070 100644 --- a/compiler-rt/lib/profile/GCDAProfiling.c +++ b/compiler-rt/lib/profile/GCDAProfiling.c @@ -348,20 +348,29 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4], fd = open(filename, O_RDWR | O_BINARY); if (fd == -1) { - /* Try opening the file, creating it if necessary. */ - new_file = 1; - mode = "w+b"; - fd = open(filename, O_RDWR | O_CREAT | O_BINARY, 0644); - if (fd == -1) { + /* Try creating the file. */ + fd = open(filename, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0644); + if (fd != -1) { + new_file = 1; + mode = "w+b"; + } else { /* Try creating the directories first then opening the file. */ __llvm_profile_recursive_mkdir(filename); - fd = open(filename, O_RDWR | O_CREAT | O_BINARY, 0644); - if (fd == -1) { - /* Bah! It's hopeless. */ - int errnum = errno; - fprintf(stderr, "profiling: %s: cannot open: %s\n", filename, - strerror(errnum)); - return; + fd = open(filename, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0644); + if (fd != -1) { + new_file = 1; + mode = "w+b"; + } else { + /* Another process may have created the file just now. + * Try opening it without O_CREAT and O_EXCL. */ + fd = open(filename, O_RDWR | O_BINARY); + if (fd == -1) { + /* Bah! It's hopeless.
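All attempts to create or open the .gcda file have failed, so report the error and give up on emitting coverage data for this file.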
*/ + int errnum = errno; + fprintf(stderr, "profiling: %s: cannot open: %s\n", filename, + strerror(errnum)); + return; + } } } } diff --git a/compiler-rt/lib/tsan/go/build.bat b/compiler-rt/lib/tsan/go/build.bat index bf502873b113..0755688e5bd3 100644 --- a/compiler-rt/lib/tsan/go/build.bat +++ b/compiler-rt/lib/tsan/go/build.bat @@ -59,4 +59,4 @@ gcc ^ -DSANITIZER_DEBUG=0 ^ -O3 ^ -fomit-frame-pointer ^ - -std=c++11 + -std=c++14 diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.driver.c b/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.driver.c new file mode 100644 index 000000000000..6ce12d35772f --- /dev/null +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.driver.c @@ -0,0 +1,36 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#define CHILDREN 7 + +int main(int argc, char *argv[]) { + _Atomic int *sync = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sync == MAP_FAILED) + return 1; + *sync = 0; + + for (int i = 0; i < CHILDREN; i++) { + pid_t pid = fork(); + if (!pid) { + // child + while (*sync == 0) + ; // wait for the parent in order to call execl simultaneously + execl(argv[1], argv[1], NULL); + } else if (pid == -1) { + *sync = 1; // release all children + return 1; + } + } + + // parent + *sync = 1; // start the program in all children simultaneously + for (int i = 0; i < CHILDREN; i++) + wait(NULL); + + return 0; +} diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.target.c b/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.target.c new file mode 100644 index 000000000000..ae6e60fb2190 --- /dev/null +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-parallel.target.c @@ -0,0 +1,9 @@ +#define COUNT 101 + +static volatile int aaa; + +int main(int argc, char *argv[]) { + for (int i = 0; i < COUNT; i++) + aaa++; + return 0; +} diff --git a/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test new file mode 100644 index 000000000000..0c7198e3c4e9 --- /dev/null +++ b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test @@ -0,0 +1,16 @@ +RUN: mkdir -p %t.d +RUN: cd %t.d + +RUN: %clang -o %t.driver %S/../Inputs/instrprof-gcov-parallel.driver.c +RUN: %clang --coverage -o %t.target %S/../Inputs/instrprof-gcov-parallel.target.c +RUN: test -f instrprof-gcov-parallel.target.gcno + +RUN: rm -f instrprof-gcov-parallel.target.gcda +RUN: %run %t.driver %t.target +RUN: llvm-cov gcov instrprof-gcov-parallel.target.gcda +RUN: FileCheck --input-file instrprof-gcov-parallel.target.c.gcov %s + +# Test if the .gcda file is correctly created from one of the child processes +# and the counters of all processes are recorded correctly.
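+# The driver mmaps a shared flag that all forked children spin on; flipping it +# releases the children so they exec the instrumented target at the same time +# and race to create and update a single .gcda file.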
+# 707 = CHILDREN * COUNT +CHECK: 707: {{[0-9]+}}: aaa++; diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index a3017e406ee1..7992703a42d2 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -94,3 +94,29 @@ add_entrypoint_object( sigaction signal_h ) + +add_entrypoint_object( + sigfillset + SRCS + sigfillset.cpp + HDRS + signal.h + ../sigfillset.h + DEPENDS + __errno_location + errno_h + signal_h +) + +add_entrypoint_object( + sigdelset + SRCS + sigdelset.cpp + HDRS + signal.h + ../sigdelset.h + DEPENDS + __errno_location + errno_h + signal_h +) diff --git a/libc/src/signal/linux/sigdelset.cpp b/libc/src/signal/linux/sigdelset.cpp new file mode 100644 index 000000000000..b04ec56cc33e --- /dev/null +++ b/libc/src/signal/linux/sigdelset.cpp @@ -0,0 +1,28 @@ +//===----------------- Linux implementation of sigdelset ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/signal/sigdelset.h" +#include "include/errno.h" +#include "src/errno/llvmlibc_errno.h" +#include "src/signal/linux/signal.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +int LLVM_LIBC_ENTRYPOINT(sigdelset)(sigset_t *set, int signum) { + if (!set || (unsigned)(signum - 1) >= (8 * sizeof(sigset_t))) { + llvmlibc_errno = EINVAL; + return -1; + } + auto *sigset = reinterpret_cast<__llvm_libc::Sigset *>(set); + sigset->delset(signum); + return 0; +} + +} // namespace __llvm_libc diff --git a/libc/src/signal/linux/sigfillset.cpp b/libc/src/signal/linux/sigfillset.cpp new file mode 100644 index 000000000000..6c10da334993 --- /dev/null +++ b/libc/src/signal/linux/sigfillset.cpp @@ -0,0 +1,28 @@ +//===----------------- Linux implementation of sigfillset -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/signal/sigfillset.h" +#include "include/errno.h" +#include "src/errno/llvmlibc_errno.h" +#include "src/signal/linux/signal.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +int LLVM_LIBC_ENTRYPOINT(sigfillset)(sigset_t *set) { + if (!set) { + llvmlibc_errno = EINVAL; + return -1; + } + auto *sigset = reinterpret_cast<__llvm_libc::Sigset *>(set); + *sigset = __llvm_libc::Sigset::fullset(); + return 0; +} + +} // namespace __llvm_libc diff --git a/libc/src/signal/linux/signal.h b/libc/src/signal/linux/signal.h index 93b33596580f..9a02ea7d847d 100644 --- a/libc/src/signal/linux/signal.h +++ b/libc/src/signal/linux/signal.h @@ -26,9 +26,9 @@ struct Sigset { constexpr static Sigset fullset() { return {-1UL}; } constexpr static Sigset emptySet() { return {0}; } - constexpr void addset(int signal) { - nativeSigset |= (1L << (signal - 1)); - } + constexpr void addset(int signal) { nativeSigset |= (1L << (signal - 1)); } + + constexpr void delset(int signal) { nativeSigset &= ~(1L << (signal - 1)); } operator sigset_t() const { return nativeSigset; } }; @@ -39,16 +39,15 @@ static inline int block_all_signals(Sigset &set) { sigset_t nativeSigset = all; sigset_t oldSet = set; int ret = __llvm_libc::syscall(SYS_rt_sigprocmask, SIG_BLOCK, &nativeSigset, - &oldSet, sizeof(sigset_t)); + &oldSet, sizeof(sigset_t)); set = {oldSet}; return ret; } static inline int restore_signals(const Sigset &set) { sigset_t nativeSigset = set; - return __llvm_libc::syscall(SYS_rt_sigprocmask, SIG_SETMASK, - &nativeSigset, nullptr, - sizeof(sigset_t)); + return __llvm_libc::syscall(SYS_rt_sigprocmask, SIG_SETMASK, &nativeSigset, + nullptr, sizeof(sigset_t)); } } // namespace __llvm_libc diff --git a/libc/src/signal/sigdelset.h b/libc/src/signal/sigdelset.h new file mode 100644 index 000000000000..05cc47ce4b9c --- /dev/null +++ b/libc/src/signal/sigdelset.h @@ -0,0 +1,20 @@ +//===------------- Implementation header for sigdelset ---------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SIGNAL_SIGDELSET_H +#define LLVM_LIBC_SRC_SIGNAL_SIGDELSET_H + +#include "include/signal.h" + +namespace __llvm_libc { + +int sigdelset(sigset_t *set, int signum); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_SIGNAL_SIGDELSET_H diff --git a/libc/src/signal/sigfillset.h b/libc/src/signal/sigfillset.h new file mode 100644 index 000000000000..facf67919a9f --- /dev/null +++ b/libc/src/signal/sigfillset.h @@ -0,0 +1,20 @@ +//===------------- Implementation header for sigfillset --------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SIGNAL_SIGFILLSET_H +#define LLVM_LIBC_SRC_SIGNAL_SIGFILLSET_H + +#include "include/signal.h" + +namespace __llvm_libc { + +int sigfillset(sigset_t *set); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_SIGNAL_SIGFILLSET_H diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index e198e416c8e6..67219c85227a 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -66,3 +66,34 @@ add_libc_unittest( __errno_location errno_h ) + +add_libc_unittest( + sigfillset_test + SUITE + libc_signal_unittests + SRCS + sigfillset_test.cpp + DEPENDS + sigfillset + sigprocmask + signal_h + raise + errno_h + __errno_location +) + +add_libc_unittest( + sigdelset_test + SUITE + libc_signal_unittests + SRCS + sigdelset_test.cpp + DEPENDS + sigdelset + sigfillset + sigprocmask + signal_h + raise + errno_h + __errno_location +) diff --git a/libc/test/src/signal/sigdelset_test.cpp b/libc/test/src/signal/sigdelset_test.cpp new file mode 100644 index 000000000000..d6e259ca4ca0 --- /dev/null +++ b/libc/test/src/signal/sigdelset_test.cpp @@ -0,0 +1,36 @@ +//===--------------------- Unittests for sigdelset ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/errno.h" +#include "include/signal.h" +#include "src/signal/raise.h" +#include "src/signal/sigdelset.h" +#include "src/signal/sigfillset.h" +#include "src/signal/sigprocmask.h" + +#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "utils/UnitTest/Test.h" + +TEST(Sigdelset, Invalid) { + using __llvm_libc::testing::ErrnoSetterMatcher::Fails; + // Invalid set. + EXPECT_THAT(__llvm_libc::sigdelset(nullptr, SIGUSR1), Fails(EINVAL)); + + sigset_t set; + // Valid set, invalid signum. + EXPECT_THAT(__llvm_libc::sigdelset(&set, -1), Fails(EINVAL)); +} + +TEST(Sigdelset, UnblockOne) { + using __llvm_libc::testing::ErrnoSetterMatcher::Succeeds; + sigset_t set; + EXPECT_THAT(__llvm_libc::sigfillset(&set), Succeeds()); + EXPECT_THAT(__llvm_libc::sigdelset(&set, SIGUSR1), Succeeds()); + EXPECT_THAT(__llvm_libc::sigprocmask(SIG_SETMASK, &set, nullptr), Succeeds()); + EXPECT_DEATH([] { __llvm_libc::raise(SIGUSR1); }, SIGUSR1); +} diff --git a/libc/test/src/signal/sigfillset_test.cpp b/libc/test/src/signal/sigfillset_test.cpp new file mode 100644 index 000000000000..35e6721ab6c6 --- /dev/null +++ b/libc/test/src/signal/sigfillset_test.cpp @@ -0,0 +1,29 @@ +//===-------------------- Unittests for sigfillset ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/errno.h" +#include "include/signal.h" +#include "src/signal/raise.h" +#include "src/signal/sigfillset.h" +#include "src/signal/sigprocmask.h" + +#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "utils/UnitTest/Test.h" + +TEST(Sigfillset, Invalid) { + using __llvm_libc::testing::ErrnoSetterMatcher::Fails; + EXPECT_THAT(__llvm_libc::sigfillset(nullptr), Fails(EINVAL)); +} + +TEST(Sigfillset, BlocksAll) { + using __llvm_libc::testing::ErrnoSetterMatcher::Succeeds; + sigset_t set; + EXPECT_THAT(__llvm_libc::sigfillset(&set), Succeeds()); + EXPECT_THAT(__llvm_libc::sigprocmask(SIG_SETMASK, &set, nullptr), Succeeds()); + EXPECT_EXITS([] { __llvm_libc::raise(SIGUSR1); }, 0); +} diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index 8fdf4a4939d1..15e9df2ea269 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -852,18 +852,6 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_integral_v // is_floating_point -#if __has_keyword(__is_floating_point) - -template <class _Tp> -struct _LIBCPP_TEMPLATE_VIS is_floating_point : _BoolConstant<__is_floating_point(_Tp)> { }; - -#if _LIBCPP_STD_VER > 14 -template <class _Tp> -_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = __is_floating_point(_Tp); -#endif - -#else - template <class _Tp> struct __libcpp_is_floating_point : public false_type {}; template <> struct __libcpp_is_floating_point<float> : public true_type {}; template <> struct __libcpp_is_floating_point<double> : public true_type {}; @@ -878,8 +866,6 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = is_floating_point<_Tp>::value; #endif -#endif // __has_keyword(__is_floating_point) - // is_array #if __has_keyword(__is_array) @@ -1192,17 +1178,6 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_enum_v // is_arithmetic -#if __has_keyword(__is_arithmetic) - -template <class _Tp> -struct _LIBCPP_TEMPLATE_VIS is_arithmetic : _BoolConstant<__is_arithmetic(_Tp)> { }; - -#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) -template <class _Tp> -_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = __is_arithmetic(_Tp); -#endif - -#else // __has_keyword(__is_arithmetic) template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_arithmetic : public integral_constant<bool, is_integral<_Tp>::value || @@ -1214,8 +1189,6 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = is_arithmetic<_Tp>::value; #endif -#endif // __has_keyword(__is_arithmetic) - // is_fundamental // In clang 9 and lower, this builtin did not work for nullptr_t.
Additionally, in C++03 mode, diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp index 728eec2938e1..03aff1681227 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp @@ -10,6 +10,8 @@ // int_type pbackfail(int_type c = traits::eof()); +// FILE_DEPENDENCIES: underflow.dat + #include <fstream> #include <cassert> diff --git a/libcxx/utils/libcxx/compiler.py b/libcxx/utils/libcxx/compiler.py index dd334cdbcff9..248b5ba8a98c 100644 --- a/libcxx/utils/libcxx/compiler.py +++ b/libcxx/utils/libcxx/compiler.py @@ -29,6 +29,7 @@ def __init__(self, config, path, flags=None, compile_flags=None, link_flags=None self.flags = list(flags or []) self.compile_flags = list(compile_flags or []) self.link_flags = list(link_flags or []) + self.link_libcxxabi_flag = '-lc++abi' self.warning_flags = list(warning_flags or []) self.verify_supported = verify_supported self.use_verify = use_verify diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index 9c154bafa808..20d0a796a3af 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -822,6 +822,7 @@ def configure_link_flags_abi_library(self): if cxxabi_library_root: libname = self.make_static_lib_name('c++abi') abs_path = os.path.join(cxxabi_library_root, libname) + self.cxx.link_libcxxabi_flag = abs_path self.cxx.link_flags += [abs_path] else: self.cxx.link_flags += ['-lc++abi'] @@ -1025,6 +1026,7 @@ def configure_substitutions(self): sub.append(('%{flags}', ' '.join(map(pipes.quote, self.cxx.flags)))) sub.append(('%{compile_flags}', ' '.join(map(pipes.quote, self.cxx.compile_flags)))) sub.append(('%{link_flags}', ' '.join(map(pipes.quote, self.cxx.link_flags)))) + sub.append(('%{link_libcxxabi}', pipes.quote(self.cxx.link_libcxxabi_flag))) if self.cxx.isVerifySupported(): sub.append(('%{verify}', ' '.join(self.cxx.verify_flags))) # Add compile and build shortcuts @@ -1048,7 +1050,6 @@ def configure_substitutions(self): exec_args.append('--host {}'.format(self.executor.user_prefix + self.executor.host)) executor = os.path.join(self.libcxx_src_root, 'utils', 'ssh.py') else: - exec_args.append('--working_directory "%S"') executor = os.path.join(self.libcxx_src_root, 'utils', 'run.py') sub.append(('%{exec}', '{} {} {} -- '.format(pipes.quote(sys.executable), pipes.quote(executor), diff --git a/libcxx/utils/libcxx/test/executor.py b/libcxx/utils/libcxx/test/executor.py index b555b1f03df9..c34310cdd2e2 100644 --- a/libcxx/utils/libcxx/test/executor.py +++ b/libcxx/utils/libcxx/test/executor.py @@ -10,6 +10,7 @@ import os import posixpath import ntpath +import shutil from libcxx.test import tracing from libcxx.util import executeCommand @@ -61,6 +62,12 @@ def run(self, exe_path, cmd=None, work_dir='.', file_deps=None, env=None): if env: env = self.merge_environments(os.environ, env) + for dep in file_deps or []: + if os.path.isdir(dep): + shutil.copytree(dep, os.path.join(work_dir, os.path.basename(dep)), symlinks=True) + else: + shutil.copy2(dep, work_dir) + out, err, rc = executeCommand(cmd, cwd=work_dir, env=env) return (cmd, out, err, rc) diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index ff23273ab9a6..1bc85f24976a 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -9,6
+9,8 @@ import copy import errno import os +import shutil +import tempfile import time import random @@ -125,8 +127,8 @@ def _execute(self, test, lit_config): lit_config.fatal('Unsupported RUN line found in test %s' % name) tmpDir, tmpBase = lit.TestRunner.getTempPaths(test) - substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, - tmpBase) + substitutions = lit.TestRunner.getDefaultSubstitutions( + test, tmpDir, tmpBase, normalize_slashes=self.execute_external) # Apply substitutions in FILE_DEPENDENCIES markup data_files = lit.TestRunner.applySubstitutions(test.file_dependencies, substitutions, @@ -209,16 +211,21 @@ def _evaluate_pass_test(self, test, tmpBase, lit_config, report += "Compilation failed unexpectedly!" return lit.Test.Result(lit.Test.FAIL, report) # Run the test - local_cwd = os.path.dirname(source_path) env = None if self.exec_env: env = self.exec_env max_retry = test.allowed_retries + 1 for retry_count in range(max_retry): - cmd, out, err, rc = self.executor.run(exec_path, [exec_path], - local_cwd, data_files, - env) + # Create a temporary directory just for that test and run the + # test in that directory + try: + execDirTmp = tempfile.mkdtemp(dir=execDir) + cmd, out, err, rc = self.executor.run(exec_path, [exec_path], + execDirTmp, data_files, + env) + finally: + shutil.rmtree(execDirTmp) report = "Compiled With: '%s'\n" % ' '.join(compile_cmd) report += libcxx.util.makeReport(cmd, out, err, rc) if rc == 0: diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py index 6a89a2b9388a..7de82c78dbfa 100644 --- a/libcxx/utils/run.py +++ b/libcxx/utils/run.py @@ -14,14 +14,15 @@ import argparse import os +import shutil import subprocess import sys +import tempfile def main(): parser = argparse.ArgumentParser() parser.add_argument('--codesign_identity', type=str, required=False) - parser.add_argument('--working_directory', type=str, required=True) parser.add_argument('--dependencies', type=str, nargs='*', required=True) parser.add_argument('--env', type=str, nargs='*', required=True) (args, remaining) = parser.parse_known_args(sys.argv[1:]) @@ -42,14 +43,23 @@ def main(): # Extract environment variables into a dictionary env = {k : v for (k, v) in map(lambda s: s.split('=', 1), args.env)} - # Ensure the file dependencies exist - for file in args.dependencies: - if not os.path.exists(file): - sys.stderr.write('Missing file {} marked as a dependency of a test'.format(file)) - exit(1) + try: + tmpDir = tempfile.mkdtemp() - # Run the executable with the given environment in the given working directory - return subprocess.call(' '.join(remaining), cwd=args.working_directory, env=env, shell=True) + # Ensure the file dependencies exist and copy them to a temporary directory. + for dep in args.dependencies: + if not os.path.exists(dep): + sys.stderr.write('Missing file or directory "{}" marked as a dependency of a test'.format(dep)) + exit(1) + if os.path.isdir(dep): + shutil.copytree(dep, os.path.join(tmpDir, os.path.basename(dep)), symlinks=True) + else: + shutil.copy2(dep, tmpDir) + + # Run the executable with the given environment in the temporary directory. 
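+ # The dependencies were copied into tmpDir above, so the test can refer to them via relative paths; as before, the joined command is run through the shell so that compound command lines keep working.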
+ return subprocess.call(' '.join(remaining), cwd=tmpDir, env=env, shell=True) + finally: + shutil.rmtree(tmpDir) if __name__ == '__main__': exit(main()) diff --git a/libcxx/utils/ssh.py b/libcxx/utils/ssh.py index 20acaeb00e3d..f6b9574e39b2 100644 --- a/libcxx/utils/ssh.py +++ b/libcxx/utils/ssh.py @@ -15,8 +15,11 @@ import argparse import os +import posixpath import subprocess import sys +import tarfile +import tempfile def main(): @@ -29,62 +32,77 @@ def main(): if len(remaining) < 2: sys.stderr.write('Missing actual commands to run') - exit(1) - remaining = remaining[1:] # Skip the '--' + return 1 - # HACK: - # If the first argument is a file that ends in `.tmp.exe`, assume it is - # the name of an executable generated by a test file. This allows us to - # do custom processing like codesigning the executable and changing its - # path when running on the remote host. It's possible for there to be no - # such executable, for example in the case of a .sh.cpp test. - exe = None - if os.path.exists(remaining[0]) and remaining[0].endswith('.tmp.exe'): - exe = remaining.pop(0) - - # If there's an executable, do any necessary codesigning. - if exe and args.codesign_identity: - rc = subprocess.call(['xcrun', 'codesign', '-f', '-s', args.codesign_identity, exe], env={}) - if rc != 0: - sys.stderr.write('Failed to codesign: {}'.format(exe)) - return rc + commandLine = remaining[1:] # Skip the '--' ssh = lambda command: ['ssh', '-oBatchMode=yes', args.host, command] - scp = lambda src, dst: ['scp', '-oBatchMode=yes', '-r', src, '{}:{}'.format(args.host, dst)] - - # Create a temporary directory where the test will be run - tmp = subprocess.check_output(ssh('mktemp -d /tmp/libcxx.XXXXXXXXXX')).strip() - - # Ensure the test dependencies exist and scp them to the temporary directory. - # Test dependencies can be either files or directories, so the `scp` command - # needs to use `-r`. - for dep in args.dependencies: - if not os.path.exists(dep): - sys.stderr.write('Missing file or directory {} marked as a dependency of a test'.format(dep)) - exit(1) - subprocess.call(scp(dep, tmp)) - - # If there's an executable, change its path to be in the temporary directory. - # We know it has been copied to the remote host when we handled the test - # dependencies above. - if exe: - exe = os.path.join(tmp, os.path.basename(exe)) - - # If there's an executable, make sure it has 'execute' permissions on the - # remote host. The host that compiled the executable might not have a notion - # of 'executable' permissions. - if exe: - subprocess.call(ssh('chmod +x {}'.format(exe))) - - # Execute the command through SSH in the temporary directory, with the - # correct environment. - command = [exe] + remaining if exe else remaining - res = subprocess.call(ssh('cd {} && env -i {} {}'.format(tmp, ' '.join(args.env), ' '.join(command)))) - - # Remove the temporary directory when we're done. - subprocess.call(ssh('rm -r {}'.format(tmp))) - - return res + scp = lambda src, dst: ['scp', '-oBatchMode=yes', src, '{}:{}'.format(args.host, dst)] + + # Create a temporary directory where the test will be run. + tmp = subprocess.check_output(ssh('mktemp -d /tmp/libcxx.XXXXXXXXXX'), universal_newlines=True).strip() + + # HACK: + # If an argument is a file that ends in `.tmp.exe`, assume it is the name + # of an executable generated by a test file. We call these test-executables + # below. This allows us to do custom processing like codesigning test-executables + # and changing their path when running on the remote host. 
It's also possible + # for there to be no such executable, for example in the case of a .sh.cpp + # test. + isTestExe = lambda exe: exe.endswith('.tmp.exe') and os.path.exists(exe) + pathOnRemote = lambda file: posixpath.join(tmp, os.path.basename(file)) + + try: + # Do any necessary codesigning of test-executables found in the command line. + if args.codesign_identity: + for exe in filter(isTestExe, commandLine): + subprocess.check_call(['xcrun', 'codesign', '-f', '-s', args.codesign_identity, exe], env={}) + + # Ensure the test dependencies exist, tar them up and copy the tarball + # over to the remote host. + with tempfile.NamedTemporaryFile(suffix='.tar') as tmpTar: + with tarfile.open(fileobj=tmpTar, mode='w') as tarball: + for dep in args.dependencies: + if not os.path.exists(dep): + sys.stderr.write('Missing file or directory "{}" marked as a dependency of a test'.format(dep)) + return 1 + tarball.add(dep, arcname=os.path.basename(dep)) + + remoteTarball = pathOnRemote(tmpTar.name) + tmpTar.flush() + subprocess.check_call(scp(tmpTar.name, remoteTarball)) + + # Untar the dependencies in the temporary directory and remove the tarball. + remoteCommands = [ + 'tar -xf {} -C {}'.format(remoteTarball, tmp), + 'rm {}'.format(remoteTarball) + ] + + # Make sure all test-executables in the remote command line have 'execute' + # permissions on the remote host. The host that compiled the test-executable + # might not have a notion of 'executable' permissions. + for exe in map(pathOnRemote, filter(isTestExe, commandLine)): + remoteCommands.append('chmod +x {}'.format(exe)) + + # Execute the command through SSH in the temporary directory, with the + # correct environment. We tweak the command line to run it on the remote + # host by transforming the path of test-executables to their path in the + # temporary directory, where we know they have been copied when we handled + # test dependencies above. + remoteCommands += [ + 'cd {}'.format(tmp), + 'export {}'.format(' '.join(args.env)), + ' '.join(pathOnRemote(x) if isTestExe(x) else x for x in commandLine) + ] + + # Finally, SSH to the remote host and execute all the commands. + rc = subprocess.call(ssh(' && '.join(remoteCommands))) + return rc + + finally: + # Make sure the temporary directory is removed when we're done. + subprocess.check_call(ssh('rm -r {}'.format(tmp))) + if __name__ == '__main__': exit(main()) diff --git a/libcxxabi/test/incomplete_type.sh.cpp b/libcxxabi/test/incomplete_type.sh.cpp index 71147241b760..a762d2f9c60e 100644 --- a/libcxxabi/test/incomplete_type.sh.cpp +++ b/libcxxabi/test/incomplete_type.sh.cpp @@ -15,14 +15,15 @@ // UNSUPPORTED: libcxxabi-no-exceptions -// NOTE: Pass -lc++abi explicitly and before -lc++ so that -lc++ doesn't drag +// NOTE: Link libc++abi explicitly and before libc++ so that libc++ doesn't drag // in the system libc++abi installation on OS X. (DYLD_LIBRARY_PATH is ignored // for shell tests because of Apple security features). 
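+// FILE_DEPENDENCIES tells the %{exec} harness to copy the listed files into the test's temporary (or remote) working directory before running; here the dependency is the freshly linked executable itself.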
+// FILE_DEPENDENCIES: %t.exe
 // RUN: %{cxx} %{flags} %{compile_flags} -c %s -o %t.one.o
 // RUN: %{cxx} %{flags} %{compile_flags} -c %s -o %t.two.o -DTU_ONE
-// RUN: %{cxx} %{flags} %t.one.o %t.two.o -lc++abi %{link_flags} -o %t.exe
-// RUN: %t.exe
+// RUN: %{cxx} %{flags} %t.one.o %t.two.o %{link_libcxxabi} %{link_flags} -o %t.exe
+// RUN: %{exec} %t.exe

 #include
 #include
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index d2af7f50cc4c..fc212f539731 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -137,7 +137,7 @@ getRelaTocSymAndAddend(InputSectionBase *tocSec, uint64_t offset) {

 // When accessing a symbol defined in another translation unit, compilers
 // reserve a .toc entry, allocate a local label and generate toc-indirect
-// instuctions:
+// instructions:
 //
 //   addis 3, 2, .LC0@toc@ha  # R_PPC64_TOC16_HA
 //   ld    3, .LC0@toc@l(3)   # R_PPC64_TOC16_LO_DS, load the address from a .toc entry
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index 6dad7c965f1a..9be27986693c 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -263,7 +263,7 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
 // Sort sections by the profile data provided by -callgraph-profile-file
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ huristic. All clusters are then sorted by a density
+// according to the C³ heuristic. All clusters are then sorted by a density
 // metric to further improve locality.
 DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder() {
   return CallGraphSort().run();
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index b3031a3723c2..e93dec947d90 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -708,7 +708,7 @@ static uint64_t getRelocTargetVA(const InputFile *file, RelType type, int64_t a,
     // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
     // microMIPS variants of these relocations use slightly different
     // expressions: AHL + GP - P + 3 for %lo() and AHL + GP - P - 1 for %hi()
-    // to correctly handle less-sugnificant bit of the microMIPS symbol.
+    // to correctly handle the less-significant bit of the microMIPS symbol.
     uint64_t v = in.mipsGot->getGp(file) + a - p;
     if (type == R_MIPS_LO16 || type == R_MICROMIPS_LO16)
       v += 4;
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index f52dc701541f..4d507ee9c2c9 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -833,6 +833,7 @@ void LinkerScript::assignOffsets(OutputSection *sec) {
   if (!(sec->flags & SHF_ALLOC))
     dot = 0;

+  bool prevLMARegionIsDefault = ctx->lmaRegion == nullptr;
   ctx->memRegion = sec->memRegion;
   ctx->lmaRegion = sec->lmaRegion;
   if (ctx->memRegion)
@@ -851,19 +852,19 @@

   switchTo(sec);

-  ctx->lmaOffset = 0;
-
+  // ctx->lmaOffset is LMA minus VMA. If LMA is explicitly specified via AT() or
+  // AT>, recompute ctx->lmaOffset; otherwise, if both the previous and current
+  // LMA regions are the default, reuse the previous lmaOffset; otherwise, reset
+  // lmaOffset to 0.
This emulates heuristics described in
+  // https://sourceware.org/binutils/docs/ld/Output-Section-LMA.html
   if (sec->lmaExpr)
     ctx->lmaOffset = sec->lmaExpr().getValue() - dot;
-  if (MemoryRegion *mr = sec->lmaRegion)
+  else if (MemoryRegion *mr = sec->lmaRegion)
     ctx->lmaOffset = alignTo(mr->curPos, sec->alignment) - dot;
+  else if (!prevLMARegionIsDefault)
+    ctx->lmaOffset = 0;

-  // If neither AT nor AT> is specified for an allocatable section, the linker
-  // will set the LMA such that the difference between VMA and LMA for the
-  // section is the same as the preceding output section in the same region
-  // https://sourceware.org/binutils/docs-2.20/ld/Output-Section-LMA.html
-  // This, however, should only be done by the first "non-header" section
-  // in the segment.
+  // Propagate ctx->lmaOffset to the first "non-header" section.
   if (PhdrEntry *l = ctx->outSec->ptLoad)
     if (sec == findFirstSection(l))
       l->lmaOffset = ctx->lmaOffset;
diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp
index 7453acfcf396..200ade76d306 100644
--- a/lld/ELF/ScriptLexer.cpp
+++ b/lld/ELF/ScriptLexer.cpp
@@ -187,7 +187,7 @@ static std::vector<StringRef> tokenizeExpr(StringRef s) {
       break;
     }

-    // Get a token before the opreator.
+    // Get a token before the operator.
     if (e != 0)
       ret.push_back(s.substr(0, e));

diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index cb7ee8829e53..42c8a71c4185 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -31,7 +31,18 @@ static std::string demangle(StringRef symName) {
   return std::string(symName);
 }

-std::string toString(const elf::Symbol &b) { return demangle(b.getName()); }
+std::string toString(const elf::Symbol &sym) {
+  StringRef name = sym.getName();
+  std::string ret = demangle(name);
+
+  // If sym has a non-default version, its name may have been truncated at '@'
+  // by Symbol::parseSymbolVersion(). Add the trailing part. This check is safe
+  // because every symbol name ends with '\0'.
+  if (name.data()[name.size()] == '@')
+    ret += name.data() + name.size();
+  return ret;
+}
+
 std::string toELFString(const Archive::Symbol &b) {
   return demangle(b.getName());
 }
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index ac606198afd8..ebee4af1fdad 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -21,6 +21,7 @@
 #include "llvm/Object/ELF.h"

 namespace lld {
+// Returns a string representation for a symbol for diagnostics.
 std::string toString(const elf::Symbol &);

 // There are two different ways to convert an Archive::Symbol to a string:
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 48339099dc27..8cf813ceffd0 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -2151,7 +2151,7 @@ template <class ELFT> void SymbolTableSection<ELFT>::writeTo(uint8_t *buf) {
     eSym->st_size = sym->getSize();

     // st_value is usually an address of a symbol, but that has a
-    // special meaining for uninstantiated common symbols (this can
+    // special meaning for uninstantiated common symbols (this can
     // occur if -r is given).
     if (BssSection *commonSec = getCommonSec(ent.sym))
       eSym->st_value = commonSec->alignment;
@@ -2250,7 +2250,7 @@ size_t SymtabShndxSection::getSize() const {
 // DSOs. That means resolving all dynamic symbols takes O(m)*O(n)
 // where m is the number of DSOs and n is the number of dynamic
 // symbols. For modern large programs, both m and n are large. So
-// making each step faster by using hash tables substiantially
+// making each step faster by using hash tables substantially
 // improves time to load programs.
//
// (Note that this is not the only way to design the shared library.
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 190a4fd3ac9e..05eee24ce34f 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -364,7 +364,7 @@ class MipsGotSection final : public SyntheticSection {

   // Try to merge two GOTs. In case of success the `Dst` contains
   // result of merging and the function returns true. In case of
-  // ovwerflow the `Dst` is unchanged and the function returns false.
+  // overflow the `Dst` is unchanged and the function returns false.
   bool tryMergeGots(FileGot & dst, FileGot & src, bool isPrimary);
 };

diff --git a/lld/docs/ELF/linker_script.rst b/lld/docs/ELF/linker_script.rst
index 5b904bb3a1e1..c5115c1c9d6f 100644
--- a/lld/docs/ELF/linker_script.rst
+++ b/lld/docs/ELF/linker_script.rst
@@ -51,3 +51,27 @@ sh_addralign of an *OutputSection* *S* is the maximum of

 When an *OutputSection* *S* has both ``address`` and ``ALIGN(section_align)``,
 GNU ld will set sh_addralign to ``ALIGN(section_align)``.
+
+Output section LMA
+------------------
+
+A load address (LMA) can be specified by ``AT(lma)`` or ``AT>lma_region``.
+
+- ``AT(lma)`` specifies the exact load address. If the linker script does not
+  have a PHDRS command, then a new loadable segment will be generated.
+- ``AT>lma_region`` specifies the LMA region. The lack of ``AT>lma_region``
+  means the default region is used. Note that GNU ld propagates the previous
+  LMA memory region when ``address`` is not specified. The LMA is set to the
+  current location of the memory region, aligned to the section alignment.
+  If the linker script does not have a PHDRS command and ``lma_region`` is
+  different from the previous OutputSection's ``lma_region``, a new loadable
+  segment will be generated.
+
+The two keywords cannot be specified at the same time.
+
+If neither ``AT(lma)`` nor ``AT>lma_region`` is specified:
+
+- If the previous section is also in the default LMA region, the difference
+  between the LMA and the VMA is computed to be the same as the previous
+  difference.
+- Otherwise, the LMA is set to the VMA.
diff --git a/lld/include/lld/Core/Reference.h b/lld/include/lld/Core/Reference.h
index 4769882cde50..b104f8495474 100644
--- a/lld/include/lld/Core/Reference.h
+++ b/lld/include/lld/Core/Reference.h
@@ -91,7 +91,7 @@ class Reference {
   /// Some relocations require a symbol and a value (e.g. foo + 4).
   virtual Addend addend() const = 0;

-  /// During linking, some optimzations may change addend value.
+  /// During linking, some optimizations may change addend value.
   virtual void setAddend(Addend) = 0;

   /// Returns target specific attributes of the reference.
diff --git a/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp b/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp
index a424edf4985a..bee081aec067 100644
--- a/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp
+++ b/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp
@@ -61,7 +61,7 @@ class ArchHandler_arm64 : public ArchHandler {

   /// Used by GOTPass to update GOT References.
  void updateReferenceToGOT(const Reference *ref, bool targetNowGOT) override {
-    // If GOT slot was instanciated, transform:
+    // If GOT slot was instantiated, transform:
     //   gotPage21/gotOffset12 -> page21/offset12scale8
     // If GOT slot optimized away, transform:
     //   gotPage21/gotOffset12 -> page21/addOffset12
diff --git a/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp b/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp
index 94a105a6f159..f3636feb217b 100644
--- a/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp
+++ b/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp
@@ -576,5 +576,5 @@ void addCompactUnwindPass(PassManager &pm, const MachOLinkingContext &ctx) {
   pm.add(std::make_unique<CompactUnwindPass>(ctx));
 }

-} // end namesapce mach_o
-} // end namesapce lld
+} // end namespace mach_o
+} // end namespace lld
diff --git a/lld/lib/ReaderWriter/MachO/GOTPass.cpp b/lld/lib/ReaderWriter/MachO/GOTPass.cpp
index 0f80dfa19d09..10e611c1bd2b 100644
--- a/lld/lib/ReaderWriter/MachO/GOTPass.cpp
+++ b/lld/lib/ReaderWriter/MachO/GOTPass.cpp
@@ -179,5 +179,5 @@ void addGOTPass(PassManager &pm, const MachOLinkingContext &ctx) {
   pm.add(std::make_unique<GOTPass>(ctx));
 }

-} // end namesapce mach_o
-} // end namesapce lld
+} // end namespace mach_o
+} // end namespace lld
diff --git a/lld/lib/ReaderWriter/MachO/TLVPass.cpp b/lld/lib/ReaderWriter/MachO/TLVPass.cpp
index 5f457b863d90..e0a031cfb07b 100644
--- a/lld/lib/ReaderWriter/MachO/TLVPass.cpp
+++ b/lld/lib/ReaderWriter/MachO/TLVPass.cpp
@@ -136,5 +136,5 @@ void addTLVPass(PassManager &pm, const MachOLinkingContext &ctx) {
   pm.add(std::make_unique<TLVPass>(ctx));
 }

-} // end namesapce mach_o
-} // end namesapce lld
+} // end namespace mach_o
+} // end namespace lld
diff --git a/lld/test/COFF/secidx-absolute.s b/lld/test/COFF/secidx-absolute.s
index 0b467bbb09bf..8befaf2f456c 100644
--- a/lld/test/COFF/secidx-absolute.s
+++ b/lld/test/COFF/secidx-absolute.s
@@ -3,7 +3,7 @@
 # RUN: lld-link -entry:main -nodefaultlib %t.obj -out:%t.exe
 # RUN: llvm-readobj %t.exe -sections -section-data | FileCheck %s

-# Section relocations against absolute symbols resolve to the last real ouput
+# Section relocations against absolute symbols resolve to the last real output
 # section index plus one.

 .text
diff --git a/lld/test/ELF/Inputs/undef-bad-debug.s b/lld/test/ELF/Inputs/undef-bad-debug.s
index bf517f3ea1cd..d3171f023616 100644
--- a/lld/test/ELF/Inputs/undef-bad-debug.s
+++ b/lld/test/ELF/Inputs/undef-bad-debug.s
@@ -16,7 +16,7 @@ sym4:
     .long .Lprologue_end - .Lprologue_start # prologue length
 .Lprologue_start:
     .byte 1 # minimum instruction length
-    .byte 1 # maximum operatiosn per instruction
+    .byte 1 # maximum operations per instruction
     .byte 1 # default is_stmt
     .byte -5 # line base
     .byte 14 # line range
@@ -59,7 +59,7 @@ sym4:
     .long .Lprologue2_end - .Lprologue2_start # prologue length
 .Lprologue2_start:
     .byte 1 # minimum instruction length
-    .byte 1 # maximum operatiosn per instruction
+    .byte 1 # maximum operations per instruction
     .byte 1 # default is_stmt
     .byte -5 # line base
     .byte 14 # line range
diff --git a/lld/test/ELF/allow-multiple-definition.s b/lld/test/ELF/allow-multiple-definition.s
index 8a3b97c0102e..bcf5cc7213c3 100644
--- a/lld/test/ELF/allow-multiple-definition.s
+++ b/lld/test/ELF/allow-multiple-definition.s
@@ -14,7 +14,7 @@
 # RUN: llvm-objdump -d %t3 | FileCheck %s
 # RUN: llvm-objdump -d %t4 | FileCheck --check-prefix=REVERT %s

-# inputs contain different constants for instuction movl.
+# inputs contain different constants for instruction movl.
# Tests below checks that order of files in command line # affects on what symbol will be used. # If flag allow-multiple-definition is enabled the first diff --git a/lld/test/ELF/arm-exidx-add-missing.s b/lld/test/ELF/arm-exidx-add-missing.s index 1beaa299a82e..da3c271f2355 100644 --- a/lld/test/ELF/arm-exidx-add-missing.s +++ b/lld/test/ELF/arm-exidx-add-missing.s @@ -10,7 +10,7 @@ // The range of addresses covered by the table entry is terminated by the // next table entry. This means that an executable section without a .ARM.exidx // section does not terminate the range of addresses. To fix this the linker -// synthesises an EXIDX_CANTUNWIND entry for each section wihout a .ARM.exidx +// synthesises an EXIDX_CANTUNWIND entry for each section without a .ARM.exidx // section. .syntax unified diff --git a/lld/test/ELF/executable-undefined-ignoreall.s b/lld/test/ELF/executable-undefined-ignoreall.s index a479317c0f71..cc38e17cdf61 100644 --- a/lld/test/ELF/executable-undefined-ignoreall.s +++ b/lld/test/ELF/executable-undefined-ignoreall.s @@ -1,7 +1,7 @@ # REQUIRES: x86 ## --unresolved-symbols=ignore-all behaves similar to -shared: -## for PLT relocations to undefined symbols, produce dynamic reloctions if we +## for PLT relocations to undefined symbols, produce dynamic relocations if we ## emit .dynsym. # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o diff --git a/lld/test/ELF/icf-absolute2.s b/lld/test/ELF/icf-absolute2.s index 37e26a9d9c46..5cf1d69bdecf 100644 --- a/lld/test/ELF/icf-absolute2.s +++ b/lld/test/ELF/icf-absolute2.s @@ -4,7 +4,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %S/Inputs/icf-absolute2.s -o %t2 # RUN: ld.lld %t %t2 -o /dev/null --icf=all --print-icf-sections | FileCheck -allow-empty %s -## Test we do not crash and do not fold sections which relocations reffering to +## Test we do not crash and do not fold sections which relocations referring to ## absolute symbols with a different values. # CHECK-NOT: selected diff --git a/lld/test/ELF/invalid/broken-relaxation-x64.test b/lld/test/ELF/invalid/broken-relaxation-x64.test index 1bd7fe0cc36d..97a977e2c03a 100644 --- a/lld/test/ELF/invalid/broken-relaxation-x64.test +++ b/lld/test/ELF/invalid/broken-relaxation-x64.test @@ -8,7 +8,7 @@ ## YAML below contains 2 relocations of type R_X86_64_GOTTPOFF, and a .text ## with fake content filled by 0xFF. That means instructions for relaxation are ## "broken", so they does not match any known valid relaxations. We also generate -## .tls section because we need it for correct proccessing of STT_TLS symbol. +## .tls section because we need it for correct processing of STT_TLS symbol. !ELF FileHeader: Class: ELFCLASS64 @@ -44,4 +44,4 @@ Symbols: Value: 0x12345 Size: 4 Binding: STB_GLOBAL - \ No newline at end of file + diff --git a/lld/test/ELF/linkerscript/align1.test b/lld/test/ELF/linkerscript/align1.test index e7b65fc75ffe..62dfdbfc5bd9 100644 --- a/lld/test/ELF/linkerscript/align1.test +++ b/lld/test/ELF/linkerscript/align1.test @@ -34,7 +34,7 @@ SECTIONS { # RUN: ld.lld -o %t5 --script %t.script %t.o # RUN: llvm-objdump --section-headers %t5 | FileCheck %s --check-prefix=ZERO -## Test we fail gracefuly when alignment value is not a power of 2 (#1). +## Test we fail gracefully when alignment value is not a power of 2 (#1). # RUN: echo "SECTIONS { . = 0x123; . 
= ALIGN(0x123, 3); .aaa : { *(.aaa) } }" > %t.script # RUN: not ld.lld -o /dev/null --script %t.script %t.o 2>&1 | FileCheck -check-prefix=ERR %s diff --git a/lld/test/ELF/linkerscript/at4.s b/lld/test/ELF/linkerscript/at4.s deleted file mode 100644 index a52a33e5cee4..000000000000 --- a/lld/test/ELF/linkerscript/at4.s +++ /dev/null @@ -1,28 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -# RUN: echo "SECTIONS { \ -# RUN: . = 0x1000; \ -# RUN: .aaa : { *(.aaa) } \ -# RUN: .bbb : AT(0x2008) { *(.bbb) } \ -# RUN: .ccc : { *(.ccc) } \ -# RUN: }" > %t.script -# RUN: ld.lld %t.o --script %t.script -o %t -# RUN: llvm-readelf -l %t | FileCheck %s - -# CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000008 0x000008 R 0x1000 -# CHECK-NEXT: LOAD 0x001008 0x0000000000001008 0x0000000000002008 0x000010 0x000010 R 0x1000 -# CHECK-NEXT: LOAD 0x001018 0x0000000000001018 0x0000000000001018 0x000001 0x000001 R E 0x1000 - -.global _start -_start: - nop - -.section .aaa, "a" -.quad 0 - -.section .bbb, "a" -.quad 0 - -.section .ccc, "a" -.quad 0 diff --git a/lld/test/ELF/linkerscript/lma-offset.s b/lld/test/ELF/linkerscript/lma-offset.s new file mode 100644 index 000000000000..3c739724538e --- /dev/null +++ b/lld/test/ELF/linkerscript/lma-offset.s @@ -0,0 +1,39 @@ +# REQUIRES: x86 +## Test the difference between the VMA and the LMA for sections with AT(). + +# RUN: echo '.globl _start; _start: ret; \ +# RUN: .section .a,"a"; .byte 0; \ +# RUN: .section .b,"a"; .byte 0; \ +# RUN: .section .c,"a"; .byte 0; \ +# RUN: .section .d,"a"; .byte 0; \ +# RUN: .data; .byte 0' | \ +# RUN: llvm-mc -filetype=obj -triple=x86_64 - -o %t.o +# RUN: ld.lld -T %s %t.o -o %t +# RUN: llvm-readelf -l %t | FileCheck %s + +# CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000001 0x000001 R 0x1000 + +## .b has AT(). It starts a PT_LOAD segment which also includes .c +# CHECK-NEXT: LOAD 0x001001 0x0000000000001001 0x0000000000002005 0x000002 0x000002 R 0x1000 + +## .d has AT(). It starts a PT_LOAD segment, even if the difference between +## LMA and VMA (0x2007-0x1003) is the same as the previous one. +# CHECK-NEXT: LOAD 0x001003 0x0000000000001003 0x0000000000002007 0x000001 0x000001 R 0x1000 + +## The orphan section .text starts a PT_LOAD segment. The difference between +## LMA and VMA (0x2008-0x1004) remains the same +# CHECK-NEXT: LOAD 0x001004 0x0000000000001004 0x0000000000002008 0x000001 0x000001 R E 0x1000 + +## .data starts a PT_LOAD segment. The difference remains the same. +# CHECK-NEXT: LOAD 0x001005 0x0000000000001005 0x0000000000002009 0x000001 0x000001 RW 0x1000 + +SECTIONS { + . = 0x1000; + .a : { *(.a) } + .b : AT(0x2005) { *(.b) } + .c : { *(.c) } + .d : AT(0x2007) { *(.d) } + ## Orphan section .text will be inserted here. 
+ .data : { *(.data) } +} diff --git a/lld/test/ELF/linkerscript/loadaddr.s b/lld/test/ELF/linkerscript/loadaddr.s index e2c82fc6c8cb..055b7422baeb 100644 --- a/lld/test/ELF/linkerscript/loadaddr.s +++ b/lld/test/ELF/linkerscript/loadaddr.s @@ -22,7 +22,7 @@ # CHECK-NEXT: 0000000000002008 g *ABS* 0000000000000000 bbb_lma # CHECK-NEXT: 0000000000003000 g *ABS* 0000000000000000 ccc_lma # CHECK-NEXT: 0000000000004000 g *ABS* 0000000000000000 ddd_lma -# CHECK-NEXT: 0000000000001020 g *ABS* 0000000000000000 txt_lma +# CHECK-NEXT: 0000000000004008 g *ABS* 0000000000000000 txt_lma # ERROR: {{.*}}.script:1: undefined section .zzz .global _start diff --git a/lld/test/ELF/linkerscript/map-file2.test b/lld/test/ELF/linkerscript/map-file2.test index f527e8ecdf80..535043282249 100644 --- a/lld/test/ELF/linkerscript/map-file2.test +++ b/lld/test/ELF/linkerscript/map-file2.test @@ -32,10 +32,10 @@ SECTIONS { # CHECK-NEXT: 1219 3209 8 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ddd) # CHECK-NEXT: 1228 3218 34 8 .eh_frame # CHECK-NEXT: 1228 3218 30 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.eh_frame+0x0) -# CHECK-NEXT: 125c 125c 1 4 .text -# CHECK-NEXT: 125c 125c 1 4 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.text) -# CHECK-NEXT: 125c 125c 0 1 f(int) -# CHECK-NEXT: 125c 125c 0 1 _start +# CHECK-NEXT: 125c 324c 1 4 .text +# CHECK-NEXT: 125c 324c 1 4 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.text) +# CHECK-NEXT: 125c 324c 0 1 f(int) +# CHECK-NEXT: 125c 324c 0 1 _start # CHECK-NEXT: 0 0 8 1 .comment # CHECK-NEXT: 0 0 8 1 :(.comment) # CHECK-NEXT: 0 0 48 8 .symtab diff --git a/lld/test/ELF/linkerscript/overlay.test b/lld/test/ELF/linkerscript/overlay.test index 85e140d60ab0..2d3c88759c63 100644 --- a/lld/test/ELF/linkerscript/overlay.test +++ b/lld/test/ELF/linkerscript/overlay.test @@ -28,4 +28,4 @@ SECTIONS { # CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align # CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000004000 0x000008 0x000008 R 0x1000 # CHECK-NEXT: LOAD 0x002000 0x0000000000001000 0x0000000000004008 0x000004 0x000004 R 0x1000 -# CHECK-NEXT: LOAD 0x002008 0x0000000000001008 0x0000000000001008 0x000001 0x000001 R E 0x1000 +# CHECK-NEXT: LOAD 0x002008 0x0000000000001008 0x0000000000004010 0x000001 0x000001 R E 0x1000 diff --git a/lld/test/ELF/linkerscript/subalign.s b/lld/test/ELF/linkerscript/subalign.s index bf812d17bb87..a817a6993314 100644 --- a/lld/test/ELF/linkerscript/subalign.s +++ b/lld/test/ELF/linkerscript/subalign.s @@ -34,7 +34,7 @@ # RUN: ld.lld %t1.o --script %t4.script -o %t4 # RUN: llvm-objdump -s %t4 | FileCheck --check-prefix=SUBALIGN %s -## Test we fail gracefuly when alignment value is not a power of 2. +## Test we fail gracefully when alignment value is not a power of 2. # RUN: echo "SECTIONS { .aaa : SUBALIGN(3) { *(.aaa*) } }" > %t5.script # RUN: not ld.lld %t1.o --script %t5.script -o /dev/null 2>&1 | FileCheck --check-prefix=ERR2 %s # ERR2: {{.*}}.script:1: alignment must be power of 2 diff --git a/lld/test/ELF/lto/common4.ll b/lld/test/ELF/lto/common4.ll index 7a40e4be32fa..1b041667fb7b 100644 --- a/lld/test/ELF/lto/common4.ll +++ b/lld/test/ELF/lto/common4.ll @@ -2,7 +2,7 @@ ;; Make sure that common symbols are properly internalized. ;; In this file, @a does not interpose any symbol in a DSO, -;; so LTO should be able to internelize it. +;; so LTO should be able to internalize it. 
; RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.so.o ; RUN: ld.lld -shared -o %t.so %t.so.o diff --git a/lld/test/ELF/mips-sto-pic-flag.s b/lld/test/ELF/mips-sto-pic-flag.s index c313a8eb3c7f..6c9a660efca4 100644 --- a/lld/test/ELF/mips-sto-pic-flag.s +++ b/lld/test/ELF/mips-sto-pic-flag.s @@ -1,7 +1,7 @@ # REQUIRES: mips # In case of linking PIC and non-PIC code together and generation # of a relocatable object, all PIC symbols should have STO_MIPS_PIC -# flag in the symbol table of the ouput file. +# flag in the symbol table of the output file. # RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux %s -o %t-npic.o # RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux \ diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s index 34c9a3cb315c..6ecf354f68a2 100644 --- a/lld/test/ELF/pack-dyn-relocs.s +++ b/lld/test/ELF/pack-dyn-relocs.s @@ -69,7 +69,7 @@ // ANDROID32-HEADERS: 0x6000000F ANDROID_REL [[ADDR]] // ANDROID32-HEADERS: 0x60000010 ANDROID_RELSZ [[SIZE]] -// Packed should have the groups of non-relative reloations first, followed +// Packed should have the groups of non-relative relocations first, followed // by the larger groups of relative relocations (i.e. the 8 and 9 followed // by the 7.) // ANDROID32: Section ({{.+}}) .rel.dyn { diff --git a/lld/test/ELF/ppc32-call-stub-pic.s b/lld/test/ELF/ppc32-call-stub-pic.s index 2c116c596e86..179874620f31 100644 --- a/lld/test/ELF/ppc32-call-stub-pic.s +++ b/lld/test/ELF/ppc32-call-stub-pic.s @@ -33,7 +33,7 @@ ## .got2+0x8000-0x10004 = 0x30000+0x8000-0x10004 = 65536*2+32764 # CHECK-LABEL: <_start>: -# CHECK-NEXT: bcl 20, 31, .+4 +# PIE-NEXT: bcl 20, 31, 0x10210 # PIE-NEXT: 10210: mflr 30 # PIE-NEXT: addis 30, 30, 3 # PIE-NEXT: addi 30, 30, -32412 @@ -52,6 +52,7 @@ # PIE-NEXT: bl 0x10274 ## bl 00008000.plt_pic32.f # PIE-NEXT: bl 0x10284 +# SHARED-NEXT: bcl 20, 31, 0x10230 # SHARED-NEXT: 10230: mflr 30 # SHARED-NEXT: addis 30, 30, 3 # SHARED-NEXT: addi 30, 30, -32420 @@ -116,9 +117,8 @@ ## Operand of addi: 0x100a8-.glink = 24 # CHECK-NEXT: addis 11, 11, 0 # CHECK-NEXT: mflr 0 -# CHECK-NEXT: bcl 20, 31, .+4 -# PIE-NEXT: 102ac: addi 11, 11, 24 -# SHARED-NEXT: 102cc: addi 11, 11, 24 +# CHECK-NEXT: bcl 20, 31, 0x[[#%x,NEXT:]] +# CHECK-NEXT: [[#%x,NEXT]]: addi 11, 11, 24 # CHECK-NEXT: mflr 12 # CHECK-NEXT: mtlr 0 diff --git a/lld/test/ELF/ppc32-long-thunk.s b/lld/test/ELF/ppc32-long-thunk.s index 57c442f7723d..90c284ded5fa 100644 --- a/lld/test/ELF/ppc32-long-thunk.s +++ b/lld/test/ELF/ppc32-long-thunk.s @@ -40,7 +40,7 @@ ## high-0x2028 = 0x02002008-0x2020 = 65536*512-24 # PI: <__LongThunk_high>: # PI-NEXT: 2018: mflr 0 -# PI-NEXT: bcl 20, 31, .+4 +# PI-NEXT: bcl 20, 31, 0x2020 # PI-NEXT: 2020: mflr 12 # PI-NEXT: addis 12, 12, 512 # PI-NEXT: addi 12, 12, -24 @@ -51,7 +51,7 @@ ## .text_high+16-0x2048 = 0x02002010-0x2048 = 65536*512-48 # PI: <__LongThunk_>: # PI-NEXT: 2038: mflr 0 -# PI-NEXT: bcl 20, 31, .+4 +# PI-NEXT: bcl 20, 31, 0x2040 # PI-NEXT: 2040: mflr 12 # PI-NEXT: addis 12, 12, 512 # PI-NEXT: addi 12, 12, -48 diff --git a/lld/test/ELF/ppc32-reloc-rel.s b/lld/test/ELF/ppc32-reloc-rel.s index 49d149550aae..b2bd0a461ca1 100644 --- a/lld/test/ELF/ppc32-reloc-rel.s +++ b/lld/test/ELF/ppc32-reloc-rel.s @@ -7,7 +7,7 @@ beq 1f 1: # CHECK-LABEL: section .R_PPC_REL14: -# CHECK: bt 2, .+4 +# CHECK: 100100b4: bt 2, 0x100100b8 .section .R_PPC_REL24,"ax",@progbits b 1f diff --git a/lld/test/ELF/ppc64-local-dynamic.s b/lld/test/ELF/ppc64-local-dynamic.s index d7013ee6d192..f0f3967387dd 100644 --- 
a/lld/test/ELF/ppc64-local-dynamic.s +++ b/lld/test/ELF/ppc64-local-dynamic.s @@ -95,7 +95,7 @@ k: // OutputRelocs-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend // OutputRelocs-NEXT: R_PPC64_DTPMOD64 -// Check that the got has 3 entries, 1 for the TOC and 1 stucture of 2 entries +// Check that the got has 3 entries, 1 for the TOC and 1 structure of 2 entries // for the tls variables. Also verify the address so we can check the offsets // we calculate for each relocation type. // CheckGot: got 00000018 0000000000020100 diff --git a/lld/test/ELF/ppc64-reloc-rel.s b/lld/test/ELF/ppc64-reloc-rel.s index be64a4f767ac..ea7367f38ca0 100644 --- a/lld/test/ELF/ppc64-reloc-rel.s +++ b/lld/test/ELF/ppc64-reloc-rel.s @@ -12,7 +12,7 @@ beq 1f 1: # CHECK-LABEL: Disassembly of section .R_PPC64_REL14: -# CHECK: bt 2, .+4 +# CHECK: bt 2, 0x10010198 .section .R_PPC64_REL16,"ax",@progbits .globl rel16 diff --git a/lld/test/ELF/ppc64-split-stack-adjust-overflow.s b/lld/test/ELF/ppc64-split-stack-adjust-overflow.s index b1a104474f10..f9bbf6176af5 100644 --- a/lld/test/ELF/ppc64-split-stack-adjust-overflow.s +++ b/lld/test/ELF/ppc64-split-stack-adjust-overflow.s @@ -59,6 +59,6 @@ caller: # CHECK-NEXT: addis 12, 1, -32768 # CHECK-NEXT: nop # CHECK-NEXT: cmpld 7, 12, 0 -# CHECK-NEXT: bt- 28, .+36 +# CHECK-NEXT: bt- 28, 0x10010204 .section .note.GNU-split-stack,"",@progbits diff --git a/lld/test/ELF/ppc64-split-stack-adjust-size-success.s b/lld/test/ELF/ppc64-split-stack-adjust-size-success.s index 63e0b414dc09..27fbb95c01df 100644 --- a/lld/test/ELF/ppc64-split-stack-adjust-size-success.s +++ b/lld/test/ELF/ppc64-split-stack-adjust-size-success.s @@ -58,21 +58,21 @@ caller: # CHECK-NEXT: addis 12, 1, -1 # CHECK-NEXT: addi 12, 12, 32736 # CHECK-NEXT: cmpld 7, 12, 0 -# CHECK-NEXT: bt- 28, .+36 +# CHECK-NEXT: bt- 28, 0x10010204 # SMALL-LABEL: caller # SMALL: ld 0, -28736(13) # SMALL-NEXT: addi 12, 1, -4128 # SMALL-NEXT: nop # SMALL-NEXT: cmpld 7, 12, 0 -# SMALL-NEXT: bt- 28, .+36 +# SMALL-NEXT: bt- 28, 0x10010204 # ZERO-LABEL: caller # ZERO: ld 0, -28736(13) # ZERO-NEXT: addi 12, 1, -32 # ZERO-NEXT: nop # ZERO-NEXT: cmpld 7, 12, 0 -# ZERO-NEXT: bt- 28, .+36 +# ZERO-NEXT: bt- 28, 0x10010204 .p2align 2 .global main .type main, @function diff --git a/lld/test/ELF/ppc64-split-stack-prologue-adjust-success.s b/lld/test/ELF/ppc64-split-stack-prologue-adjust-success.s index 197df150c495..002f80d5cb80 100644 --- a/lld/test/ELF/ppc64-split-stack-prologue-adjust-success.s +++ b/lld/test/ELF/ppc64-split-stack-prologue-adjust-success.s @@ -16,7 +16,7 @@ # A caller with a stack that is small enough that the addis instruction # from the split-stack prologue is unneeded, and after the prologue adjustment -# the stack size still fits whithin 16 bits. +# the stack size still fits within 16 bits. .p2align 2 .global caller_small_stack .type caller_small_stack, @function @@ -54,9 +54,9 @@ caller_small_stack: # CHECK-NEXT: addi 12, 1, -16416 # CHECK-NEXT: nop # CHECK-NEXT: cmpld 7, 12, 0 -# CHECK-NEXT: bt- 28, .+36 +# CHECK-NEXT: bt- 28, 0x10010204 -# A caller that has a stack size that fits whithin 16 bits, but the adjusted +# A caller that has a stack size that fits within 16 bits, but the adjusted # stack size after prologue adjustment now overflows 16 bits needing both addis # and addi instructions. 
.p2align 2 @@ -132,7 +132,7 @@ caller_large_stack: # CHECK-NEXT: addis 12, 1, -1 # CHECK-NEXT: addi 12, 12, -16416 # CHECK-NEXT: cmpld 7, 12, 0 -# CHECK-NEXT: bt- 28, .+44 +# CHECK-NEXT: bt- 28, 0x100102bc # A caller with a stack size that is larger then 16 bits, but aligned such that # the addi instruction is unneeded. @@ -174,7 +174,7 @@ caller_large_aligned_stack: # CHECK-NEXT: addis 12, 1, -2 # CHECK-NEXT: addi 12, 12, -16384 # CHECK-NEXT: cmpld 7, 12, 0 -# CHECK-NEXT: bt- 28, .+40 +# CHECK-NEXT: bt- 28, 0x10010318 # main only calls split-stack functions or __morestack so # there should be no adjustment of its split-stack prologue. diff --git a/lld/test/ELF/threads.s b/lld/test/ELF/threads.s index 6d4b36924031..7a9eeec00e08 100644 --- a/lld/test/ELF/threads.s +++ b/lld/test/ELF/threads.s @@ -1,3 +1,4 @@ +# REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o ## A positive integer is allowed. diff --git a/lld/test/ELF/undef-suggest-version.s b/lld/test/ELF/undef-suggest-version.s new file mode 100644 index 000000000000..790b9fcab788 --- /dev/null +++ b/lld/test/ELF/undef-suggest-version.s @@ -0,0 +1,57 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: echo 'v1 {bar;};' > %t.ver +# RUN: ld.lld -shared --version-script %t.ver %t.o -o %t.so + +## For an unversioned undefined symbol, check we can suggest the symbol with the +## default version. +# RUN: echo 'call bat' | llvm-mc -filetype=obj -triple=x86_64 - -o %tdef1.o +# RUN: not ld.lld %t.so %tdef1.o -o /dev/null 2>&1 | FileCheck --check-prefix=DEFAULT1 %s + +# DEFAULT1: error: undefined symbol: bat +# DEFAULT1-NEXT: >>> referenced by {{.*}}.o:(.text+0x1) +# DEFAULT1-NEXT: >>> did you mean: bar{{$}} +# DEFAULT1-NEXT: >>> defined in: {{.*}}.so + +## For a versioned undefined symbol, check we can suggest the symbol with the +## default version. +# RUN: echo '.symver bar.v2,bar@v2; call bar.v2' | llvm-mc -filetype=obj -triple=x86_64 - -o %tdef2.o +# RUN: not ld.lld %t.so %tdef2.o -o /dev/null 2>&1 | FileCheck --check-prefix=DEFAULT2 %s + +# DEFAULT2: error: undefined symbol: bar@v2 +# DEFAULT2-NEXT: >>> referenced by {{.*}}.o:(.text+0x1) +# DEFAULT2-NEXT: >>> did you mean: bar{{$}} +# DEFAULT2-NEXT: >>> defined in: {{.*}}.so + +## For an unversioned undefined symbol, check we can suggest a symbol with +## a non-default version. +# RUN: echo 'call foo; call _Z3fooi' | llvm-mc -filetype=obj -triple=x86_64 - -o %thidden1.o +# RUN: not ld.lld %t.so %thidden1.o -o /dev/null 2>&1 | FileCheck --check-prefix=HIDDEN1 %s + +# HIDDEN1: error: undefined symbol: foo +# HIDDEN1-NEXT: >>> referenced by {{.*}}.o:(.text+0x1) +# HIDDEN1-NEXT: >>> did you mean: foo@v1 +# HIDDEN1-NEXT: >>> defined in: {{.*}}.so +# HIDDEN1-EMPTY: +# HIDDEN1-NEXT: error: undefined symbol: foo(int) +# HIDDEN1-NEXT: >>> referenced by {{.*}}.o:(.text+0x6) +# HIDDEN1-NEXT: >>> did you mean: foo(int)@v1 +# HIDDEN1-NEXT: >>> defined in: {{.*}}.so + +## For a versioned undefined symbol, check we can suggest a symbol with +## a different version. +# RUN: echo '.symver foo.v2,foo@v2; call foo.v2' | llvm-mc -filetype=obj -triple=x86_64 - -o %thidden2.o +# RUN: not ld.lld %t.so %thidden2.o -o /dev/null 2>&1 | FileCheck --check-prefix=HIDDEN2 %s + +# HIDDEN2: error: undefined symbol: foo@v2 +# HIDDEN2-NEXT: >>> referenced by {{.*}}.o:(.text+0x1) +# HIDDEN2-NEXT: >>> did you mean: foo@v1 +# HIDDEN2-NEXT: >>> defined in: {{.*}}.so + +## %t.so exports bar@@v1 and two VERSYM_HIDDEN symbols: foo@v1 and _Z3fooi@v1. 
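The expectations in undef-suggest-version.s above encode a simple matching rule: an undefined foo or foo@v2 is paired with the defined foo@v1 by comparing names with any @version suffix stripped (the bat to bar suggestion additionally relies on lld's edit-distance typo correction, which is not modeled here). A rough Python model of the version-aware half of the rule, illustrative only and not lld's implementation:

def suggest(undef, defined):
    # The undefined name and the candidate match on the part before
    # any '@version' suffix.
    base = undef.split('@')[0]
    for sym in defined:
        if sym.split('@')[0] == base:
            return 'did you mean: ' + sym
    return None

print(suggest('foo', ['bar', 'foo@v1']))     # did you mean: foo@v1
print(suggest('foo@v2', ['bar', 'foo@v1']))  # did you mean: foo@v1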
+.globl foo.v1, _Z3fooi.v1, bar +.symver foo.v1,foo@v1 +.symver _Z3fooi.v1,_Z3fooi@v1 +foo.v1: +_Z3fooi.v1: +bar: diff --git a/lld/test/ELF/warn-backrefs.s b/lld/test/ELF/warn-backrefs.s index 629bd3be2fd9..8a9ee2291ece 100644 --- a/lld/test/ELF/warn-backrefs.s +++ b/lld/test/ELF/warn-backrefs.s @@ -39,7 +39,7 @@ # RUN: echo ".globl foo; foo: call bar" | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t4.o # RUN: ld.lld --fatal-warnings --warn-backrefs %t1.o --start-lib %t3.o %t4.o --end-lib -o /dev/null -# We don't report backward references to weak symbols as they can be overriden later. +# We don't report backward references to weak symbols as they can be overridden later. # RUN: echo ".weak foo; foo:" | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t5.o # RUN: ld.lld --fatal-warnings --warn-backrefs --start-lib %t5.o --end-lib %t1.o %t2.o -o /dev/null diff --git a/lld/test/mach-o/error-simulator-vs-macosx.yaml b/lld/test/mach-o/error-simulator-vs-macosx.yaml index 609eb3be43ab..fcf1774d760a 100644 --- a/lld/test/mach-o/error-simulator-vs-macosx.yaml +++ b/lld/test/mach-o/error-simulator-vs-macosx.yaml @@ -1,7 +1,7 @@ # RUN: ld64.lld -arch i386 -macosx_version_min 10.8 %s %p/Inputs/hello-world-x86.yaml -o %t && llvm-nm -m %t | FileCheck %s # RUN: not ld64.lld -arch i386 -ios_simulator_version_min 5.0 %s %p/Inputs/hello-world-x86.yaml -o %t 2>&1 | FileCheck %s --check-prefix=ERROR # -# Test that i386 can link with a macos version but gives an error with a simululator version. +# Test that i386 can link with a macos version but gives an error with a simulator version. # --- !mach-o diff --git a/lld/test/mach-o/parse-literals-error.yaml b/lld/test/mach-o/parse-literals-error.yaml index 9dad0cbbf974..de8b47c53047 100644 --- a/lld/test/mach-o/parse-literals-error.yaml +++ b/lld/test/mach-o/parse-literals-error.yaml @@ -1,7 +1,7 @@ # RUN: not ld64.lld -arch x86_64 -r -print_atoms %s -o %t 2> %t.err # RUN: FileCheck %s < %t.err # -# Test for error if literal section is not correct size mulitple. +# Test for error if literal section is not correct size multiple. # --- !mach-o diff --git a/lld/test/wasm/early-exit-for-bad-paths.s b/lld/test/wasm/early-exit-for-bad-paths.s new file mode 100644 index 000000000000..2866bfa62f86 --- /dev/null +++ b/lld/test/wasm/early-exit-for-bad-paths.s @@ -0,0 +1,22 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o + +# RUN: not wasm-ld %t.o -o does_not_exist/output 2>&1 | \ +# RUN: FileCheck %s -check-prefixes=NO-DIR-OUTPUT,CHECK +# RUN: not wasm-ld %t.o -o %s/dir_is_a_file 2>&1 | \ +# RUN: FileCheck %s -check-prefixes=DIR-IS-OUTPUT,CHECK +# TODO(sbc): check similar check for -Map file once we add that option + +# NO-DIR-OUTPUT: error: cannot open output file does_not_exist/output: +# DIR-IS-OUTPUT: error: cannot open output file {{.*}}/dir_is_a_file: + +# We should exit before doing the actual link. If an undefined symbol error is +# discovered we haven't bailed out early as expected. +# CHECK-NOT: undefined_symbol + +# RUN: not wasm-ld %t.o -o / 2>&1 | FileCheck %s -check-prefixes=ROOT,CHECK +# ROOT: error: cannot open output file / + +_start: + .functype _start () -> () + call undefined_symbol + end_function diff --git a/lld/test/wasm/entry-signature.ll b/lld/test/wasm/entry-signature.ll index 8e245b14e964..f7f3d481acfc 100644 --- a/lld/test/wasm/entry-signature.ll +++ b/lld/test/wasm/entry-signature.ll @@ -1,4 +1,4 @@ -; Verify that the entry point signauture can be flexible. 
+; Verify that the entry point signature can be flexible.
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld -o %t1.wasm %t.o
diff --git a/lld/test/wasm/export-optional-lazy.ll b/lld/test/wasm/export-optional-lazy.ll
index 960e71c6ae6c..c37a3e5183eb 100644
--- a/lld/test/wasm/export-optional-lazy.ll
+++ b/lld/test/wasm/export-optional-lazy.ll
@@ -1,7 +1,7 @@
 ; Optional linker-synthetic symbols are only created if they are undefined
 ; in the final output.
 ; This test is for a regression where an explicit --export of an lazy archive
-; symbol caused an undefined referece to an optional symbol to occur *after*
+; symbol caused an undefined reference to an optional symbol to occur *after*
 ; the optional symbols were created.

 ; RUN: llc -filetype=obj %s -o %t.o
diff --git a/lld/test/wasm/lto/incompatible.ll b/lld/test/wasm/lto/incompatible.ll
index 6f7c154e959c..335756420639 100644
--- a/lld/test/wasm/lto/incompatible.ll
+++ b/lld/test/wasm/lto/incompatible.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: x86
 ; RUN: llvm-as %s -o %t.bc
-; RUN: not wasm-ld %t.bc -o out.wasm 2>&1 | FileCheck %s
+; RUN: not wasm-ld %t.bc -o %t.wasm 2>&1 | FileCheck %s

 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/lld/test/wasm/lto/opt-level.ll b/lld/test/wasm/lto/opt-level.ll
index f6156e725c7d..d2d552728b16 100644
--- a/lld/test/wasm/lto/opt-level.ll
+++ b/lld/test/wasm/lto/opt-level.ll
@@ -11,7 +11,7 @@
 ; RUN:   FileCheck --check-prefix=INVALID %s
 ; INVALID: invalid optimization level for LTO: 6

-; RUN: not wasm-ld -o %t3 -m elf_x86_64 -e main --lto-O-1 %t.o 2>&1 | \
+; RUN: not wasm-ld -o %t3 -e main --lto-O-1 %t.o 2>&1 | \
 ; RUN:   FileCheck --check-prefix=INVALIDNEGATIVE %s
 ; INVALIDNEGATIVE: invalid optimization level for LTO: 4294967295

diff --git a/lld/test/wasm/lto/signature-mismatch.ll b/lld/test/wasm/lto/signature-mismatch.ll
index e12d91866023..cf1a998826fc 100644
--- a/lld/test/wasm/lto/signature-mismatch.ll
+++ b/lld/test/wasm/lto/signature-mismatch.ll
@@ -3,7 +3,7 @@
 ; RUN: not wasm-ld --fatal-warnings %t.o %t1.o -o %t.wasm 2>&1 | FileCheck %s

 ; Test that functions defined in bitcode correctly report signature
-; mistmaches with existing undefined sybmols in normal objects.
+; mismatches with existing undefined symbols in normal objects.

 target triple = "wasm32-unknown-unknown"

diff --git a/lld/test/wasm/lto/undef.ll b/lld/test/wasm/lto/undef.ll
index 65e8e4642d5e..a5477cb3c3bd 100644
--- a/lld/test/wasm/lto/undef.ll
+++ b/lld/test/wasm/lto/undef.ll
@@ -8,7 +8,7 @@ target triple = "wasm32-unknown-unknown"
 declare i32 @bar()

 ; Symbols such as foo which are only called indirectly are handled slightly
-; differently with resepect to signature checking.
+; differently with respect to signature checking.
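One detail worth calling out in opt-level.ll above: --lto-O-1 is diagnosed as level 4294967295 because the option value ends up in an unsigned 32-bit integer, so -1 wraps around before the range check. A hypothetical re-implementation of just that check in Python, not lld's actual parser:

def parse_lto_level(text):
    # The level is held as an unsigned 32-bit value, so negative inputs
    # wrap around before the range check runs.
    level = int(text) % 2**32
    if level > 3:
        raise ValueError('invalid optimization level for LTO: %d' % level)
    return level

# parse_lto_level('6')   -> ValueError: invalid optimization level for LTO: 6
# parse_lto_level('-1')  -> ValueError: invalid optimization level for LTO: 4294967295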
 declare i32 @foo()
 @ptr = global i8* bitcast (i32 ()* @foo to i8*), align 8
diff --git a/lld/test/wasm/responsefile.test b/lld/test/wasm/responsefile.test
index ba2afd0cc8fd..85ac41f93f2b 100644
--- a/lld/test/wasm/responsefile.test
+++ b/lld/test/wasm/responsefile.test
@@ -10,11 +10,11 @@ RUN: FileCheck --check-prefix=INVRSP %s
 INVRSP: invalid response file quoting: foobar

 RUN: echo "blah\foo" > %t.rsp
-RUN: not wasm-ld --rsp-quoting=windows @%t.rsp 2>&1 | \
+RUN: not wasm-ld -o a.out --rsp-quoting=windows @%t.rsp 2>&1 | \
 RUN: FileCheck --check-prefix=WINRSP %s
 WINRSP: error: cannot open blah\foo:

 RUN: echo "blah\foo" > %t.rsp
-RUN: not wasm-ld --rsp-quoting=posix @%t.rsp 2>&1 | \
+RUN: not wasm-ld -o a.out --rsp-quoting=posix @%t.rsp 2>&1 | \
 RUN: FileCheck --check-prefix=POSRSP %s
 POSRSP: error: cannot open blahfoo:

diff --git a/lld/test/wasm/signature-mismatch-unknown.ll b/lld/test/wasm/signature-mismatch-unknown.ll
index b354ed12207d..9bbad0065dcd 100644
--- a/lld/test/wasm/signature-mismatch-unknown.ll
+++ b/lld/test/wasm/signature-mismatch-unknown.ll
@@ -4,7 +4,7 @@
 ; RUN: wasm-ld --fatal-warnings -o %t.wasm %t.main.o %t.ret32.o

 ; Also test the case where there are two different object files that contains
-; referneces ret32:
+; references ret32:
 ; %t.main.o: Does not call ret32 directly; used the wrong signature.
 ; %t.call-ret32.o: Calls ret32 directly; uses the correct signature.
 ; RUN: llc -filetype=obj %p/Inputs/call-ret32.ll -o %t.call-ret32.o
diff --git a/lld/test/wasm/undefined-entry.test b/lld/test/wasm/undefined-entry.test
index a36212f2c9e3..3106a76510f4 100644
--- a/lld/test/wasm/undefined-entry.test
+++ b/lld/test/wasm/undefined-entry.test
@@ -3,7 +3,7 @@ RUN: not wasm-ld -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s
 RUN: not wasm-ld --allow-undefined -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s
 RUN: not wasm-ld -entry=foo -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-CUSTOM

-CHECK: error: entry symbol not defined (pass --no-entry to supress): _start
-CHECK-CUSTOM: error: entry symbol not defined (pass --no-entry to supress): foo
+CHECK: error: entry symbol not defined (pass --no-entry to suppress): _start
+CHECK-CUSTOM: error: entry symbol not defined (pass --no-entry to suppress): foo

 RUN: wasm-ld --no-entry -o %t.wasm %t.ret32.o
diff --git a/lld/test/wasm/visibility-hidden.ll b/lld/test/wasm/visibility-hidden.ll
index f5731c4e964a..99acd5651f7b 100644
--- a/lld/test/wasm/visibility-hidden.ll
+++ b/lld/test/wasm/visibility-hidden.ll
@@ -3,12 +3,12 @@
 ; RUN: rm -f %t2.a
 ; RUN: llvm-ar rcs %t2.a %t2.o

-; Test that symbols with hidden visitiblity are not export, even with
+; Test that symbols with hidden visibility are not exported, even with
 ; --export-dynamic
 ; RUN: wasm-ld --export-dynamic %t.o %t2.a -o %t.wasm
 ; RUN: obj2yaml %t.wasm | FileCheck %s

-; Test that symbols with default visitiblity are not exported without
+; Test that symbols with default visibility are not exported without
 ; --export-dynamic
 ; RUN: wasm-ld %t.o %t2.a -o %t.nodef.wasm
 ; RUN: obj2yaml %t.nodef.wasm | FileCheck %s -check-prefix=NO-DEFAULT
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index d0b01d5ccef0..b6cd879e89c6 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -15,6 +15,7 @@
 #include "Writer.h"
 #include "lld/Common/Args.h"
 #include "lld/Common/ErrorHandler.h"
+#include "lld/Common/Filesystem.h"
 #include "lld/Common/Memory.h"
 #include "lld/Common/Reproduce.h"
 #include "lld/Common/Strings.h"
@@ -304,6 +305,8 @@ void LinkerDriver::createFiles(opt::InputArgList
&args) {
       break;
     }
   }
+  if (files.empty() && errorCount() == 0)
+    error("no input files");
 }

 static StringRef getEntry(opt::InputArgList &args) {
@@ -728,16 +731,27 @@ void LinkerDriver::link(ArrayRef<const char *> argsArr) {
   errorHandler().errorLimit = args::getInteger(args, OPT_error_limit, 20);

   readConfigs(args);
+
+  createFiles(args);
+  if (errorCount())
+    return;
+
   setConfigs();
   checkOptions(args);
+  if (errorCount())
+    return;

   if (auto *arg = args.getLastArg(OPT_allow_undefined_file))
     readImportFile(arg->getValue());

-  if (!args.hasArg(OPT_INPUT)) {
-    error("no input files");
+  // Fail early if the output file or map file is not writable. If a user has a
+  // long link, e.g. due to a large LTO link, they do not want to run it only to
+  // find that it failed because there was a mistake in their command line.
+  if (auto e = tryCreateFile(config->outputFile))
+    error("cannot open output file " + config->outputFile + ": " + e.message());
+  // TODO(sbc): add check for map file too once we add support for that.
+  if (errorCount())
     return;
-  }

   // Handle --trace-symbol.
   for (auto *arg : args.filtered(OPT_trace_symbol))
@@ -748,10 +762,6 @@ void LinkerDriver::link(ArrayRef<const char *> argsArr) {

   createSyntheticSymbols();

-  createFiles(args);
-  if (errorCount())
-    return;
-
   // Add all files to the symbol table. This will add almost all
   // symbols that we need to the symbol table.
   for (InputFile *f : files)
@@ -774,7 +784,7 @@ void LinkerDriver::link(ArrayRef<const char *> argsArr) {
   if (entrySym && entrySym->isDefined())
     entrySym->forceExport = true;
   else
-    error("entry symbol not defined (pass --no-entry to supress): " +
+    error("entry symbol not defined (pass --no-entry to suppress): " +
           config->entry);
 }

diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index dc095adf6dff..99d18178eeac 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -138,7 +138,7 @@ static void reportTypeError(const Symbol *existing, const InputFile *file,
 }

 // Check the type of new symbol matches that of the symbol is replacing.
-// Returns true if the function types match, false is there is a singature
+// Returns true if the function types match, false if there is a signature
 // mismatch.
 static bool signatureMatches(FunctionSymbol *existing,
                              const WasmSignature *newSig) {
@@ -279,7 +279,7 @@ Symbol *SymbolTable::addDefinedFunction(StringRef name, uint32_t flags,
   std::tie(s, wasInserted) = insert(name, file);

   auto replaceSym = [&](Symbol *sym) {
-    // If the new defined function doesn't have signture (i.e. bitcode
+    // If the new defined function doesn't have signature (i.e. bitcode
     // functions) but the old symbol does, then preserve the old signature
     const WasmSignature *oldSig = s->getSignature();
     auto* newSym = replaceSymbol<DefinedFunction>(sym, name, flags, file, function);
diff --git a/lld/wasm/SymbolTable.h b/lld/wasm/SymbolTable.h
index 9803ad439f9c..522ea46b38bc 100644
--- a/lld/wasm/SymbolTable.h
+++ b/lld/wasm/SymbolTable.h
@@ -108,7 +108,7 @@ class SymbolTable {
   llvm::DenseMap<llvm::CachedHashStringRef, Symbol *> symMap;
   std::vector<Symbol *> symVector;

-  // For certain symbols types, e.g. function symbols, we allow for muliple
+  // For certain symbol types, e.g. function symbols, we allow for multiple
   // variants of the same symbol with different signatures.
  llvm::DenseMap<llvm::CachedHashStringRef, std::vector<Symbol *>> symVariants;

diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index 349fc651533e..7112db6b0826 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -156,7 +156,7 @@ void Symbol::setGOTIndex(uint32_t index) {
   LLVM_DEBUG(dbgs() << "setGOTIndex " << name << " -> " << index << "\n");
   assert(gotIndex == INVALID_INDEX);
   if (config->isPic) {
-    // Any symbol that is assigned a GOT entry must be exported othewise the
+    // Any symbol that is assigned a GOT entry must be exported otherwise the
     // dynamic linker won't be able create the entry that contains it.
     forceExport = true;
   }
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 11836e11f3ec..3400cde1c7e2 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -415,7 +415,7 @@ class LazySymbol : public Symbol {

   // Lazy symbols can have a signature because they can replace an
   // UndefinedFunction which which case we need to be able to preserve the
-  // signture.
+  // signature.
   // TODO(sbc): This repetition of the signature field is inelegant. Revisit
   // the use of class hierarchy to represent symbol taxonomy.
   const WasmSignature *signature = nullptr;
diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index d18027c7248c..dfff30029168 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -196,10 +196,11 @@ class Type : public std::enable_shared_from_this<Type>, public UserID {

   uint32_t GetEncodingMask();

+  typedef uint32_t Payload;
   /// Return the language-specific payload.
-  uint32_t GetPayload() { return m_payload; }
+  Payload GetPayload() { return m_payload; }
   /// Return the language-specific payload.
-  void SetPayload(uint32_t opaque_payload) { m_payload = opaque_payload; }
+  void SetPayload(Payload opaque_payload) { m_payload = opaque_payload; }

 protected:
   ConstString m_name;
@@ -215,7 +216,7 @@ class Type : public std::enable_shared_from_this<Type>, public UserID {
   CompilerType m_compiler_type;
   ResolveState m_compiler_type_resolve_state;
   /// Language-specific flags.
-  uint32_t m_payload;
+  Payload m_payload;

   Type *GetEncodingType();

diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 966d460ea13d..5058594505f5 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -2414,9 +2414,12 @@ def expect_expr(

         # Set the usual default options for normal expressions.
options.SetIgnoreBreakpoints(True) - options.SetLanguage(frame.GuessLanguage()) - eval_result = frame.EvaluateExpression(expr, options) + if self.frame().IsValid(): + options.SetLanguage(frame.GuessLanguage()) + eval_result = self.frame().EvaluateExpression(expr, options) + else: + eval_result = self.target().EvaluateExpression(expr, options) if not eval_result.GetError().Success(): self.assertTrue(eval_result.GetError().Success(), diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index b2485393cd6a..8a6af3881a0f 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -501,6 +501,8 @@ Status Host::RunShellCommand(const Args &args, const FileSpec &working_dir, launch_info.SetArguments(args, first_arg_is_executable); } + launch_info.GetEnvironment() = Host::GetEnvironment(); + if (working_dir) launch_info.SetWorkingDirectory(working_dir); llvm::SmallString<64> output_file_path; diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index eba3060f8ec6..045ba7f3671f 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -9,13 +9,9 @@ #include "lldb/Host/Host.h" #include +#include -// On device doesn't have supporty for XPC. -#if defined(__APPLE__) && (defined(__arm64__) || defined(__aarch64__)) -#define NO_XPC_SERVICES 1 -#endif - -#if !defined(NO_XPC_SERVICES) +#if TARGET_OS_OSX #define __XPC_PRIVATE_H__ #include @@ -135,6 +131,8 @@ return false; } +#if TARGET_OS_OSX + static void *AcceptPIDFromInferior(void *arg) { const char *connect_url = (const char *)arg; ConnectionFileDescriptor file_conn; @@ -153,8 +151,6 @@ return NULL; } -#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) - const char *applscript_in_new_tty = "tell application \"Terminal\"\n" " activate\n" " do script \"/bin/bash -c '%s';exit\"\n" @@ -307,13 +303,13 @@ repeat with the_window in (get windows)\n\ return error; } -#endif // #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) +#endif // TARGET_OS_OSX bool Host::OpenFileInExternalEditor(const FileSpec &file_spec, uint32_t line_no) { -#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) +#if !TARGET_OS_OSX return false; -#else +#else // !TARGET_OS_OSX // We attach this to an 'odoc' event to specify a particular selection typedef struct { int16_t reserved0; // must be zero @@ -404,7 +400,7 @@ repeat with the_window in (get windows)\n\ } return true; -#endif // #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) +#endif // TARGET_OS_OSX } Environment Host::GetEnvironment() { return Environment(*_NSGetEnviron()); } @@ -689,7 +685,7 @@ static bool GetMacOSXProcessUserAndGroup(ProcessInstanceInfo &process_info) { return false; } -#if !NO_XPC_SERVICES +#if TARGET_OS_OSX static void PackageXPCArguments(xpc_object_t message, const char *prefix, const Args &args) { size_t count = args.GetArgumentCount(); @@ -841,7 +837,7 @@ static short GetPosixspawnFlags(const ProcessLaunchInfo &launch_info) { static Status LaunchProcessXPC(const char *exe_path, ProcessLaunchInfo &launch_info, lldb::pid_t &pid) { -#if !NO_XPC_SERVICES +#if TARGET_OS_OSX Status error = getXPCAuthorization(launch_info); if (error.Fail()) return error; @@ -1194,7 +1190,7 @@ static Status LaunchProcessPosixSpawn(const char *exe_path, static bool ShouldLaunchUsingXPC(ProcessLaunchInfo &launch_info) { bool result = false; -#if !NO_XPC_SERVICES +#if TARGET_OS_OSX bool launchingAsRoot = launch_info.GetUserID() 
== 0; bool currentUserIsRoot = HostInfo::GetEffectiveUserID() == 0; @@ -1226,7 +1222,7 @@ static bool ShouldLaunchUsingXPC(ProcessLaunchInfo &launch_info) { } if (launch_info.GetFlags().Test(eLaunchFlagLaunchInTTY)) { -#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) +#if TARGET_OS_OSX return LaunchInNewTerminalWithAppleScript(exe_spec.GetPath().c_str(), launch_info); #else diff --git a/lldb/source/Interpreter/OptionValuePathMappings.cpp b/lldb/source/Interpreter/OptionValuePathMappings.cpp index ebff5c4dca3e..2784279579f0 100644 --- a/lldb/source/Interpreter/OptionValuePathMappings.cpp +++ b/lldb/source/Interpreter/OptionValuePathMappings.cpp @@ -61,7 +61,7 @@ Status OptionValuePathMappings::SetValueFromString(llvm::StringRef value, count); } else { bool changed = false; - for (size_t i = 1; i < argc; i += 2, ++idx) { + for (size_t i = 1; i < argc; i += 2) { const char *orginal_path = args.GetArgumentAtIndex(i); const char *replace_path = args.GetArgumentAtIndex(i + 1); if (VerifyPathExists(replace_path)) { @@ -70,9 +70,13 @@ Status OptionValuePathMappings::SetValueFromString(llvm::StringRef value, if (!m_path_mappings.Replace(a, b, idx, m_notify_changes)) m_path_mappings.Append(a, b, m_notify_changes); changed = true; + idx++; } else { + std::string previousError = + error.Fail() ? std::string(error.AsCString()) + "\n" : ""; error.SetErrorStringWithFormat( - "the replacement path doesn't exist: \"%s\"", replace_path); + "%sthe replacement path doesn't exist: \"%s\"", + previousError.c_str(), replace_path); break; } } @@ -109,9 +113,11 @@ Status OptionValuePathMappings::SetValueFromString(llvm::StringRef value, m_value_was_set = true; changed = true; } else { + std::string previousError = + error.Fail() ? std::string(error.AsCString()) + "\n" : ""; error.SetErrorStringWithFormat( - "the replacement path doesn't exist: \"%s\"", replace_path); - break; + "%sthe replacement path doesn't exist: \"%s\"", + previousError.c_str(), replace_path); } } if (changed) @@ -135,7 +141,7 @@ Status OptionValuePathMappings::SetValueFromString(llvm::StringRef value, bool changed = false; if (op == eVarSetOperationInsertAfter) ++idx; - for (size_t i = 1; i < argc; i += 2, ++idx) { + for (size_t i = 1; i < argc; i += 2) { const char *orginal_path = args.GetArgumentAtIndex(i); const char *replace_path = args.GetArgumentAtIndex(i + 1); if (VerifyPathExists(replace_path)) { @@ -143,9 +149,13 @@ Status OptionValuePathMappings::SetValueFromString(llvm::StringRef value, ConstString b(replace_path); m_path_mappings.Insert(a, b, idx, m_notify_changes); changed = true; + idx++; } else { + std::string previousError = + error.Fail() ? 
std::string(error.AsCString()) + "\n" : ""; error.SetErrorStringWithFormat( - "the replacement path doesn't exist: \"%s\"", replace_path); + "%sthe replacement path doesn't exist: \"%s\"", + previousError.c_str(), replace_path); break; } } diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 38b4472f50a7..385b291df709 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -782,6 +782,9 @@ std::unique_ptr ObjectFilePECOFF::CreateCallFrameInfo() { if (!data_dir_exception.vmaddr) return {}; + if (m_coff_header.machine != llvm::COFF::IMAGE_FILE_MACHINE_AMD64) + return {}; + return std::make_unique(*this, data_dir_exception.vmaddr, data_dir_exception.vmsize); } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index a609b06d4e13..f355681f2679 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -56,12 +56,12 @@ class Declaration; /// The implementation of lldb::Type's m_payload field for TypeSystemClang. class TypePayloadClang { /// Layout: bit 31 ... IsCompleteObjCClass. - uint32_t m_payload = 0; + Type::Payload m_payload = 0; public: TypePayloadClang() = default; explicit TypePayloadClang(bool is_complete_objc_class); explicit TypePayloadClang(uint32_t opaque_payload) : m_payload(opaque_payload) {} - operator uint32_t() { return m_payload; } + operator Type::Payload() { return m_payload; } static constexpr unsigned ObjCClassBit = 1 << 31; bool IsCompleteObjCClass() { return Flags(m_payload).Test(ObjCClassBit); } diff --git a/lldb/test/API/commands/expression/anonymous-struct/TestCallUserAnonTypedef.py b/lldb/test/API/commands/expression/anonymous-struct/TestCallUserAnonTypedef.py index b6e035752896..1a4e16610042 100644 --- a/lldb/test/API/commands/expression/anonymous-struct/TestCallUserAnonTypedef.py +++ b/lldb/test/API/commands/expression/anonymous-struct/TestCallUserAnonTypedef.py @@ -6,22 +6,15 @@ Ticket: https://llvm.org/bugs/show_bug.cgi?id=26790 """ - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class TestExprLookupAnonStructTypedef(TestBase): mydir = TestBase.compute_mydir(__file__) - def setUp(self): - TestBase.setUp(self) - # Find the breakpoint - self.line = line_number('main.cpp', '// lldb testsuite break') - @expectedFailureAll( oslist=['linux'], archs=['arm'], @@ -29,16 +22,5 @@ def setUp(self): def test(self): """Test typedeffed untagged struct arguments for function call expressions""" self.build() - - self.runCmd("file "+self.getBuildArtifact("a.out"), - CURRENT_EXECUTABLE_SET) - lldbutil.run_break_set_by_file_and_line( - self, - "main.cpp", - self.line, - num_expected_locations=-1, - loc_exact=True - ) - - self.runCmd("run", RUN_SUCCEEDED) + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp")) self.expect_expr("multiply(&s)", result_type="double", result_value="1") diff --git a/lldb/test/API/commands/expression/anonymous-struct/main.cpp b/lldb/test/API/commands/expression/anonymous-struct/main.cpp index 5b170c5f943a..d6366e787152 100644 --- a/lldb/test/API/commands/expression/anonymous-struct/main.cpp +++ b/lldb/test/API/commands/expression/anonymous-struct/main.cpp @@ -1,26 +1,17 @@ -#include - typedef struct { - float f; - int i; + float f; 
+ int i; } my_untagged_struct; -double multiply(my_untagged_struct *s) -{ - return s->f * s->i; -} +double multiply(my_untagged_struct *s) { return s->f * s->i; } -double multiply(my_untagged_struct *s, int x) -{ - return multiply(s) * x; -} +double multiply(my_untagged_struct *s, int x) { return multiply(s) * x; } -int main(int argc, char **argv) -{ - my_untagged_struct s = { - .f = (float)argc, - .i = argc, - }; - // lldb testsuite break - return !(multiply(&s, argc) == pow(argc, 3)); +int main(int argc, char **argv) { + my_untagged_struct s = { + .f = (float)argc, + .i = argc, + }; + // break here + return multiply(&s, argc) > 0; } diff --git a/lldb/test/API/commands/expression/call-function/TestCallBuiltinFunction.py b/lldb/test/API/commands/expression/call-function/TestCallBuiltinFunction.py index 31478884ad7d..55ba2717c013 100644 --- a/lldb/test/API/commands/expression/call-function/TestCallBuiltinFunction.py +++ b/lldb/test/API/commands/expression/call-function/TestCallBuiltinFunction.py @@ -17,24 +17,10 @@ class ExprCommandCallBuiltinFunction(TestBase): # Builtins are expanded by Clang, so debug info shouldn't matter. NO_DEBUG_INFO_TESTCASE = True - def setUp(self): - TestBase.setUp(self) - # Find the line number to break for main.c. - self.line = line_number( - 'main.cpp', - '// Please test these expressions while stopped at this line:') - def test(self): self.build() - # Set breakpoint in main and run exe - self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) - lldbutil.run_break_set_by_file_and_line( - self, "main.cpp", self.line, num_expected_locations=-1, loc_exact=True) - - self.runCmd("run", RUN_SUCCEEDED) - - # Test different builtin functions. + target = self.dbg.CreateTarget(self.getBuildArtifact("a.out")) self.expect_expr("__builtin_isinf(0.0f)", result_type="int", result_value="0") self.expect_expr("__builtin_isnormal(0.0f)", result_type="int", result_value="0") diff --git a/lldb/test/API/commands/expression/call-function/TestCallStdStringFunction.py b/lldb/test/API/commands/expression/call-function/TestCallStdStringFunction.py index 261e702fa59a..f94bcae34cf9 100644 --- a/lldb/test/API/commands/expression/call-function/TestCallStdStringFunction.py +++ b/lldb/test/API/commands/expression/call-function/TestCallStdStringFunction.py @@ -2,26 +2,15 @@ Test calling std::String member functions. """ - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class ExprCommandCallFunctionTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) - def setUp(self): - # Call super's setUp(). - TestBase.setUp(self) - # Find the line number to break for main.c. 
- self.line = line_number( - 'main.cpp', - '// Please test these expressions while stopped at this line:') - @expectedFailureAll( compiler="icc", bugnumber="llvm.org/pr14437, fails with ICC 13.1") @@ -29,15 +18,7 @@ def setUp(self): def test_with(self): """Test calling std::String member function.""" self.build() - self.runCmd("file " + self.getBuildArtifact("a.out"), - CURRENT_EXECUTABLE_SET) - - # Some versions of GCC encode two locations for the 'return' statement - # in main.cpp - lldbutil.run_break_set_by_file_and_line( - self, "main.cpp", self.line, num_expected_locations=-1, loc_exact=True) - - self.runCmd("run", RUN_SUCCEEDED) + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp")) self.expect("print str", substrs=['Hello world']) diff --git a/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py b/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py index 0f0f1a54e31c..1191176aa706 100644 --- a/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py +++ b/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py @@ -2,13 +2,10 @@ Test calling a function, stopping in the call, continue and gather the result on stop. """ - - import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class ExprCommandCallStopContinueTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) @@ -17,27 +14,16 @@ def setUp(self): # Call super's setUp(). TestBase.setUp(self) # Find the line number to break for main.c. - self.line = line_number( - 'main.cpp', - '// Please test these expressions while stopped at this line:') - self.func_line = line_number('main.cpp', '{5, "five"}') def test(self): """Test gathering result from interrupted function call.""" self.build() - self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) - - # Some versions of GCC encode two locations for the 'return' statement - # in main.cpp - lldbutil.run_break_set_by_file_and_line( - self, "main.cpp", self.line, num_expected_locations=-1, loc_exact=True) - - self.runCmd("run", RUN_SUCCEEDED) + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp")) lldbutil.run_break_set_by_file_and_line( self, "main.cpp", - self.func_line, + line_number('main.cpp', '{5, "five"}'), num_expected_locations=-1, loc_exact=True) diff --git a/lldb/test/API/commands/expression/call-function/TestCallUserDefinedFunction.py b/lldb/test/API/commands/expression/call-function/TestCallUserDefinedFunction.py index 98cd0f24f36c..edaa76174b47 100644 --- a/lldb/test/API/commands/expression/call-function/TestCallUserDefinedFunction.py +++ b/lldb/test/API/commands/expression/call-function/TestCallUserDefinedFunction.py @@ -7,36 +7,19 @@ """ - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class ExprCommandCallUserDefinedFunction(TestBase): mydir = TestBase.compute_mydir(__file__) - def setUp(self): - # Call super's setUp(). - TestBase.setUp(self) - # Find the line number to break for main.c. 
- self.line = line_number( - 'main.cpp', - '// Please test these expressions while stopped at this line:') - def test(self): """Test return values of user defined function calls.""" self.build() - - # Set breakpoint in main and run exe - self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) - lldbutil.run_break_set_by_file_and_line( - self, "main.cpp", self.line, num_expected_locations=-1, loc_exact=True) - - self.runCmd("run", RUN_SUCCEEDED) + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp")) # Test recursive function call. self.expect_expr("fib(5)", result_type="unsigned int", result_value="5") diff --git a/lldb/test/API/commands/expression/call-function/main.cpp b/lldb/test/API/commands/expression/call-function/main.cpp index cc5f52dbf567..a383ce5c22a0 100644 --- a/lldb/test/API/commands/expression/call-function/main.cpp +++ b/lldb/test/API/commands/expression/call-function/main.cpp @@ -1,53 +1,34 @@ -#include <iostream> -#include <string> #include <cstring> +#include <string> -struct Five -{ - int number; - const char *name; +struct Five { + int number; + const char *name; }; -Five -returnsFive() -{ - Five my_five = {5, "five"}; - return my_five; +Five returnsFive() { + Five my_five = {5, "five"}; + return my_five; } -unsigned int -fib(unsigned int n) -{ - if (n < 2) - return n; - else - return fib(n - 1) + fib(n - 2); +unsigned int fib(unsigned int n) { + if (n < 2) + return n; + else + return fib(n - 1) + fib(n - 2); } -int -add(int a, int b) -{ - return a + b; -} +int add(int a, int b) { return a + b; } -bool -stringCompare(const char *str) -{ - if (strcmp( str, "Hello world" ) == 0) - return true; - else - return false; +bool stringCompare(const char *str) { + if (strcmp(str, "Hello world") == 0) + return true; + else + return false; } -int main (int argc, char const *argv[]) -{ - std::string str = "Hello world"; - std::cout << str << std::endl; - std::cout << str.c_str() << std::endl; - Five main_five = returnsFive(); -#if 0 - print str - print str.c_str() -#endif - return 0; // Please test these expressions while stopped at this line: +int main(int argc, char const *argv[]) { + std::string str = "Hello world"; + Five main_five = returnsFive(); + return strlen(str.c_str()); // break here } diff --git a/lldb/test/API/commands/expression/char/TestExprsChar.py b/lldb/test/API/commands/expression/char/TestExprsChar.py index f1fa78053846..a1a4568aa92e 100644 --- a/lldb/test/API/commands/expression/char/TestExprsChar.py +++ b/lldb/test/API/commands/expression/char/TestExprsChar.py @@ -1,44 +1,21 @@ - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class ExprCharTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) - def setUp(self): - # Call super's setUp().
- TestBase.setUp(self) - - self.main_source = "main.cpp" - self.main_source_spec = lldb.SBFileSpec(self.main_source) - def do_test(self, dictionary=None): """These basic expression commands should work as expected.""" self.build(dictionary=dictionary) - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(self, - '// Break here', self.main_source_spec) - frame = thread.GetFrameAtIndex(0) - - value = frame.EvaluateExpression("foo(c)") - self.assertTrue(value.IsValid()) - self.assertTrue(value.GetError().Success()) - self.assertEqual(value.GetValueAsSigned(0), 1) - - value = frame.EvaluateExpression("foo(sc)") - self.assertTrue(value.IsValid()) - self.assertTrue(value.GetError().Success()) - self.assertEqual(value.GetValueAsSigned(0), 2) + lldbutil.run_to_source_breakpoint(self, '// Break here', lldb.SBFileSpec("main.cpp")) - value = frame.EvaluateExpression("foo(uc)") - self.assertTrue(value.IsValid()) - self.assertTrue(value.GetError().Success()) - self.assertEqual(value.GetValueAsSigned(0), 3) + self.expect_expr("foo(c)", result_value="1") + self.expect_expr("foo(sc)", result_value="2") + self.expect_expr("foo(uc)", result_value="3") def test_default_char(self): self.do_test() diff --git a/lldb/test/API/commands/expression/xvalue/TestXValuePrinting.py b/lldb/test/API/commands/expression/xvalue/TestXValuePrinting.py index 3a394d781f0a..f5122b84839a 100644 --- a/lldb/test/API/commands/expression/xvalue/TestXValuePrinting.py +++ b/lldb/test/API/commands/expression/xvalue/TestXValuePrinting.py @@ -1,36 +1,15 @@ - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - class ExprXValuePrintingTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) - def setUp(self): - # Call super's setUp(). 
- TestBase.setUp(self) - - self.main_source = "main.cpp" - self.main_source_spec = lldb.SBFileSpec(self.main_source) - - def do_test(self, dictionary=None): - """Printing an xvalue should work.""" - self.build(dictionary=dictionary) - - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(self, - '// Break here', self.main_source_spec) - frame = thread.GetFrameAtIndex(0) - - value = frame.EvaluateExpression("foo().data") - self.assertTrue(value.IsValid()) - self.assertTrue(value.GetError().Success()) - self.assertEqual(value.GetValueAsSigned(), 1234) - @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr21765") def test(self): - self.do_test() - + """Printing an xvalue should work.""" + self.build() + lldbutil.run_to_source_breakpoint(self, '// Break here', lldb.SBFileSpec("main.cpp")) + self.expect_expr("foo().data", result_value="1234") diff --git a/lldb/test/API/commands/watchpoints/watchpoint_count/Makefile b/lldb/test/API/commands/watchpoints/watchpoint_count/Makefile new file mode 100644 index 000000000000..10495940055b --- /dev/null +++ b/lldb/test/API/commands/watchpoints/watchpoint_count/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/commands/watchpoints/watchpoint_count/TestWatchpointCount.py b/lldb/test/API/commands/watchpoints/watchpoint_count/TestWatchpointCount.py new file mode 100644 index 000000000000..9ad21522b4aa --- /dev/null +++ b/lldb/test/API/commands/watchpoints/watchpoint_count/TestWatchpointCount.py @@ -0,0 +1,44 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestWatchpointCount(TestBase): + mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + TestBase.setUp(self) + + @skipIf(oslist=["linux"], archs=["aarch64"]) + def test_watchpoint_count(self): + self.build() + (_, process, thread, _) = lldbutil.run_to_source_breakpoint(self, "patatino", lldb.SBFileSpec("main.c")) + frame = thread.GetFrameAtIndex(0) + first_var = frame.FindVariable("x1") + second_var = frame.FindVariable("x2") + + error = lldb.SBError() + first_watch = first_var.Watch(True, False, True, error) + if not error.Success(): + self.fail( + "Failed to make watchpoint for x1: %s" % + (error.GetCString())) + + second_watch = second_var.Watch(True, False, True, error) + if not error.Success(): + self.fail( + "Failed to make watchpoint for x2: %s" % + (error.GetCString())) + process.Continue() + + stop_reason = thread.GetStopReason() + self.assertEqual(stop_reason, lldb.eStopReasonWatchpoint, "watchpoint for x1 not hit") + stop_reason_descr = thread.GetStopDescription(256) + self.assertEqual(stop_reason_descr, "watchpoint 1") + + process.Continue() + stop_reason = thread.GetStopReason() + self.assertEqual(stop_reason, lldb.eStopReasonWatchpoint, "watchpoint for x2 not hit") + stop_reason_descr = thread.GetStopDescription(256) + self.assertEqual(stop_reason_descr, "watchpoint 2") diff --git a/lldb/test/API/commands/watchpoints/watchpoint_count/main.c b/lldb/test/API/commands/watchpoints/watchpoint_count/main.c new file mode 100644 index 000000000000..fc9a370e41f3 --- /dev/null +++ b/lldb/test/API/commands/watchpoints/watchpoint_count/main.c @@ -0,0 +1,13 @@ +#include <stdint.h> +#include <stdio.h> + +int main() { + uint8_t x1 = 0; + uint16_t x2 = 0; + + printf("patatino\n"); + + x1 += 1; + x2 += 2; + return 0; +} diff --git a/lldb/test/API/functionalities/source-map/TestTargetSourceMap.py
b/lldb/test/API/functionalities/source-map/TestTargetSourceMap.py index ac03d80023e4..c9800e6f199e 100644 --- a/lldb/test/API/functionalities/source-map/TestTargetSourceMap.py +++ b/lldb/test/API/functionalities/source-map/TestTargetSourceMap.py @@ -1,6 +1,7 @@ import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * +import os class TestTargetSourceMap(TestBase): @@ -10,6 +11,21 @@ class TestTargetSourceMap(TestBase): @no_debug_info_test def test_source_map(self): """Test target.source-map' functionality.""" + + def assertBreakpointWithSourceMap(src_path): + # Set a breakpoint after we remap source and verify that it succeeds + bp = target.BreakpointCreateByLocation(src_path, 2) + self.assertEquals(bp.GetNumLocations(), 1, + "make sure breakpoint was resolved with map") + + # Now make sure that we can actually FIND the source file using this + # remapping: + retval = lldb.SBCommandReturnObject() + self.dbg.GetCommandInterpreter().HandleCommand("source list -f main.c -l 2", retval) + self.assertTrue(retval.Succeeded(), "source list didn't succeed.") + self.assertNotEqual(retval.GetOutput(), None, "We got no ouput from source list") + self.assertTrue("return" in retval.GetOutput(), "We didn't find the source file...") + # Set the target soure map to map "./" to the current test directory src_dir = self.getSourceDir() src_path = os.path.join(src_dir, "main.c") @@ -25,19 +41,68 @@ def test_source_map(self): bp = target.BreakpointCreateByLocation(src_path, 2) self.assertEquals(bp.GetNumLocations(), 0, "make sure no breakpoints were resolved without map") - src_map_cmd = 'settings set target.source-map . "%s"' % (src_dir) - self.dbg.HandleCommand(src_map_cmd) - # Set a breakpoint after we remap source and verify that it succeeds - bp = target.BreakpointCreateByLocation(src_path, 2) - self.assertEquals(bp.GetNumLocations(), 1, - "make sure breakpoint was resolved with map") - - # Now make sure that we can actually FIND the source file using this - # remapping: - retval = lldb.SBCommandReturnObject() - self.dbg.GetCommandInterpreter().HandleCommand("source list -f main.c -l 2", retval) - self.assertTrue(retval.Succeeded(), "source list didn't succeed.") - self.assertNotEqual(retval.GetOutput(), None, "We got no ouput from source list") - self.assertTrue("return" in retval.GetOutput(), "We didn't find the source file...") + invalid_path = src_dir + "invalid_path" + invalid_path2 = src_dir + "invalid_path2" + + # We make sure the error message contains all the invalid paths + self.expect( + 'settings set target.source-map . "%s" . "%s" . "%s"' % (invalid_path, src_dir, invalid_path2), + substrs=[ + 'the replacement path doesn\'t exist: "%s"' % (invalid_path), + 'the replacement path doesn\'t exist: "%s"' % (invalid_path2), + ], + error=True, + ) + self.expect( + 'settings show target.source-map', + substrs=['[0] "." -> "%s"' % (src_dir)], + ) + assertBreakpointWithSourceMap(src_path) + + # Index 0 is the valid mapping, and modifying it to an invalid one should have no effect + self.expect( + 'settings replace target.source-map 0 . "%s"' % (invalid_path), + substrs=['error: the replacement path doesn\'t exist: "%s"' % (invalid_path)], + error=True, + ) + self.expect( + 'settings show target.source-map', + substrs=['[0] "." 
-> "%s"' % (src_dir)] + ) + assertBreakpointWithSourceMap(src_path) + + # Let's clear and add the mapping in with insert-after + self.runCmd('settings remove target.source-map 0') + self.expect( + 'settings show target.source-map', + endstr="target.source-map (path-map) =\n", + ) + # We add a valid but useless mapping so that we can use insert-after + another_valid_path = os.path.dirname(src_dir) + self.runCmd('settings set target.source-map . "%s"' % (another_valid_path)) + + self.expect( + 'settings replace target.source-map 0 . "%s"' % (invalid_path), + substrs=['error: the replacement path doesn\'t exist: "%s"' % (invalid_path)], + error=True, + ) + self.expect( + 'settings show target.source-map', + substrs=['[0] "." -> "%s"' % (another_valid_path)] + ) + + # Let's clear and add the mapping in with append + self.expect('settings remove target.source-map 0') + self.expect( + 'settings show target.source-map', + endstr="target.source-map (path-map) =\n", + ) + + self.expect( + 'settings append target.source-map . "%s" . "%s"' % (invalid_path, src_dir), + substrs=['error: the replacement path doesn\'t exist: "%s"' % (invalid_path)], + error=True, + ) + assertBreakpointWithSourceMap(src_path) diff --git a/lldb/test/API/python_api/sbplatform/Makefile b/lldb/test/API/python_api/sbplatform/Makefile new file mode 100644 index 000000000000..99998b20bcb0 --- /dev/null +++ b/lldb/test/API/python_api/sbplatform/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/python_api/sbplatform/TestSBPlatform.py b/lldb/test/API/python_api/sbplatform/TestSBPlatform.py new file mode 100644 index 000000000000..4735f6ea3b49 --- /dev/null +++ b/lldb/test/API/python_api/sbplatform/TestSBPlatform.py @@ -0,0 +1,22 @@ +"""Test the SBPlatform APIs.""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + +class SBPlatformAPICase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(['pyapi']) + def test_run(self): + self.build() + plat = lldb.SBPlatform.GetHostPlatform() + + os.environ["MY_TEST_ENV_VAR"]="SBPlatformAPICase.test_run" + def cleanup(): + del os.environ["MY_TEST_ENV_VAR"] + self.addTearDownHook(cleanup) + cmd = lldb.SBPlatformShellCommand(self.getBuildArtifact("a.out")) + self.assertTrue(plat.Run(cmd).Success()) + self.assertIn("MY_TEST_ENV_VAR=SBPlatformAPICase.test_run", cmd.GetOutput()) diff --git a/lldb/test/API/python_api/sbplatform/main.cpp b/lldb/test/API/python_api/sbplatform/main.cpp new file mode 100644 index 000000000000..9f2aca26ab8d --- /dev/null +++ b/lldb/test/API/python_api/sbplatform/main.cpp @@ -0,0 +1,8 @@ +#include +#include + +int main() { + printf("MY_TEST_ENV_VAR=%s\n", getenv("MY_TEST_ENV_VAR")); + + return 0; +} diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp index e5d4b05d987c..3e7bda88e6af 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -1067,31 +1067,34 @@ uint32_t DNBArchMachARM64::GetHardwareWatchpointHit(nub_addr_t &addr) { "DNBArchMachARM64::GetHardwareWatchpointHit() addr = 0x%llx", (uint64_t)addr); - // This is the watchpoint value to match against, i.e., word address. 
- nub_addr_t wp_val = addr & ~((nub_addr_t)3); if (kret == KERN_SUCCESS) { DBG &debug_state = m_state.dbg; uint32_t i, num = NumSupportedHardwareWatchpoints(); for (i = 0; i < num; ++i) { nub_addr_t wp_addr = GetWatchAddress(debug_state, i); - DNBLogThreadedIf(LOG_WATCHPOINTS, "DNBArchMachARM64::" - "GetHardwareWatchpointHit() slot: %u " - "(addr = 0x%llx).", - i, (uint64_t)wp_addr); - if (wp_val == wp_addr) { - uint32_t byte_mask = bits(debug_state.__wcr[i], 12, 5); - - // Sanity check the byte_mask, first. - if (LowestBitSet(byte_mask) < 0) - continue; - - // Check that the watchpoint is enabled. - if (!IsWatchpointEnabled(debug_state, i)) - continue; - - // Compute the starting address (from the point of view of the - // debugger). - addr = wp_addr + LowestBitSet(byte_mask); + uint32_t byte_mask = bits(debug_state.__wcr[i], 12, 5); + + DNBLogThreadedIf(LOG_WATCHPOINTS, "DNBArchImplX86_64::" + "GetHardwareWatchpointHit() slot: %u " + "(addr = 0x%llx; byte_mask = 0x%x)", + i, static_cast<uint64_t>(wp_addr), + byte_mask); + + if (!IsWatchpointEnabled(debug_state, i)) + continue; + + if (bits(wp_addr, 48, 3) != bits(addr, 48, 3)) + continue; + + // Sanity check the byte_mask + uint32_t lsb = LowestBitSet(byte_mask); + if (lsb < 0) + continue; + + uint64_t byte_to_match = bits(addr, 2, 0); + + if (byte_mask & (1 << byte_to_match)) { + addr = wp_addr + lsb; return i; } } diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 058e76fc2b0c..f6c0a66f4407 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1107,14 +1107,16 @@ Currently, only the following parameter attributes are defined: .. _noalias: ``noalias`` - This indicates that objects accessed via pointer values + This indicates that memory locations accessed via pointer values :ref:`based ` on the argument or return value are not also accessed, during the execution of the function, via pointer values not - *based* on the argument or return value. The attribute on a return value - also has additional semantics described below. The caller shares the - responsibility with the callee for ensuring that these requirements are met. - For further details, please see the discussion of the NoAlias response in - :ref:`alias analysis `. + *based* on the argument or return value. This guarantee only holds for + memory locations that are *modified*, by any means, during the execution of + the function. The attribute on a return value also has additional semantics + described below. The caller shares the responsibility with the callee for + ensuring that these requirements are met. For further details, please see + the discussion of the NoAlias response in :ref:`alias analysis `. Note that this definition of ``noalias`` is intentionally similar to the definition of ``restrict`` in C99 for function arguments. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ce04592bf53e..5f5ef62f0139 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -233,31 +233,6 @@ class TargetTransformInfo { /// the EXT operation. int getExtCost(const Instruction *I, const Value *Src) const; - /// Estimate the cost of a function call when lowered. - /// - /// The contract for this is the same as \c getOperationCost except that it - /// supports an interface that provides extra information specific to call - /// instructions.
- /// - /// This is the most basic query for estimating call cost: it only knows the - /// function type and (potentially) the number of arguments at the call site. - /// The latter is only interesting for varargs function types. - int getCallCost(FunctionType *FTy, int NumArgs = -1, - const User *U = nullptr) const; - - /// Estimate the cost of calling a specific function when lowered. - /// - /// This overload adds the ability to reason about the particular function - /// being called in the event it is a library call with special lowering. - int getCallCost(const Function *F, int NumArgs = -1, - const User *U = nullptr) const; - - /// Estimate the cost of calling a specific function when lowered. - /// - /// This overload allows specifying a set of candidate argument values. - int getCallCost(const Function *F, ArrayRef<const Value *> Arguments, - const User *U = nullptr) const; - /// \returns A value by which our inlining threshold should be multiplied. /// This is primarily used to bump up the inlining threshold wholesale on /// targets where calls are unusually expensive. @@ -279,15 +254,11 @@ class TargetTransformInfo { int getInlinerVectorBonusPercent() const; /// Estimate the cost of an intrinsic when lowered. - /// - /// Mirrors the \c getCallCost method but uses an intrinsic identifier. int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys, const User *U = nullptr) const; /// Estimate the cost of an intrinsic when lowered. - /// - /// Mirrors the \c getCallCost method but uses an intrinsic identifier. int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments, const User *U = nullptr) const; @@ -1206,10 +1177,6 @@ class TargetTransformInfo::Concept { virtual int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef<const Value *> Operands) = 0; virtual int getExtCost(const Instruction *I, const Value *Src) = 0; - virtual int getCallCost(FunctionType *FTy, int NumArgs, const User *U) = 0; - virtual int getCallCost(const Function *F, int NumArgs, const User *U) = 0; - virtual int getCallCost(const Function *F, - ArrayRef<const Value *> Arguments, const User *U) = 0; virtual unsigned getInliningThresholdMultiplier() = 0; virtual int getInlinerVectorBonusPercent() = 0; virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, @@ -1455,16 +1422,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { int getExtCost(const Instruction *I, const Value *Src) override { return Impl.getExtCost(I, Src); } - int getCallCost(FunctionType *FTy, int NumArgs, const User *U) override { - return Impl.getCallCost(FTy, NumArgs, U); - } - int getCallCost(const Function *F, int NumArgs, const User *U) override { - return Impl.getCallCost(F, NumArgs, U); - } - int getCallCost(const Function *F, - ArrayRef<const Value *> Arguments, const User *U) override { - return Impl.getCallCost(F, Arguments, U); - } unsigned getInliningThresholdMultiplier() override { return Impl.getInliningThresholdMultiplier(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 765d35a05a46..8749fa49010b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -132,21 +132,6 @@ class TargetTransformInfoImplBase { return TTI::TCC_Basic; } - unsigned getCallCost(FunctionType *FTy, int NumArgs, const User *U) { - assert(FTy && "FunctionType must be provided to this routine."); - - // The target-independent implementation just measures the size of the - // function by approximating
that each argument will take on average one - instruction to prepare. - - if (NumArgs < 0) - // Set the argument number to the number of explicit arguments in the - // function. - NumArgs = FTy->getNumParams(); - - return TTI::TCC_Basic * (NumArgs + 1); - } - unsigned getInliningThresholdMultiplier() { return 1; } int getInlinerVectorBonusPercent() { return 150; } @@ -726,37 +711,6 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { explicit TargetTransformInfoImplCRTPBase(const DataLayout &DL) : BaseT(DL) {} public: - using BaseT::getCallCost; - - unsigned getCallCost(const Function *F, int NumArgs, const User *U) { - assert(F && "A concrete function must be provided to this routine."); - - if (NumArgs < 0) - // Set the argument number to the number of explicit arguments in the - // function. - NumArgs = F->arg_size(); - - if (Intrinsic::ID IID = F->getIntrinsicID()) { - FunctionType *FTy = F->getFunctionType(); - SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end()); - return static_cast<T *>(this) - ->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys, U); - } - - if (!static_cast<T *>(this)->isLoweredToCall(F)) - return TTI::TCC_Basic; // Give a basic cost if it will be lowered - // directly. - - return static_cast<T *>(this)->getCallCost(F->getFunctionType(), NumArgs, U); - } - - unsigned getCallCost(const Function *F, ArrayRef<const Value *> Arguments, - const User *U) { - // Simply delegate to generic handling of the call. - // FIXME: We should use instsimplify or something else to catch calls which - // will constant fold with these arguments. - return static_cast<T *>(this)->getCallCost(F, Arguments.size(), U); - } using BaseT::getGEPCost; @@ -898,15 +852,19 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { if (auto CS = ImmutableCallSite(U)) { const Function *F = CS.getCalledFunction(); - if (!F) { - // Just use the called value type. - Type *FTy = CS.getCalledValue()->getType()->getPointerElementType(); - return TargetTTI->getCallCost(cast<FunctionType>(FTy), - CS.arg_size(), U); - } + if (F) { + FunctionType *FTy = F->getFunctionType(); + if (Intrinsic::ID IID = F->getIntrinsicID()) { + SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end()); + return TargetTTI->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys, U); + } - SmallVector<const Value *, 8> Arguments(CS.arg_begin(), CS.arg_end()); - return TargetTTI->getCallCost(F, Arguments, U); + if (!TargetTTI->isLoweredToCall(F)) + return TTI::TCC_Basic; // Give a basic cost if it will be lowered + + return TTI::TCC_Basic * (FTy->getNumParams() + 1); + } + return TTI::TCC_Basic * (CS.arg_size() + 1); } if (isa(U) || isa(U) || isa(U)) diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 6797ed2369d8..36aea31365c2 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -339,22 +339,8 @@ bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0); /// /// This is the reverse process of "canWidenShuffleElements", but can always /// succeed. -template <typename T> -void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask, - SmallVectorImpl<T> &ScaledMask) { - assert(Scale > 0 && "Unexpected scaling factor"); - - // Fast-path: if no scaling, then it is just a copy. - if (Scale == 1) { - ScaledMask.assign(Mask.begin(), Mask.end()); - return; - } - - ScaledMask.clear(); - for (int MaskElt : Mask) - for (int ScaleElt = 0; ScaleElt != (int)Scale; ++ScaleElt) - ScaledMask.push_back(MaskElt < 0 ?
MaskElt : Scale * MaskElt + ScaleElt); -} +void scaleShuffleMask(size_t Scale, ArrayRef<int> Mask, + SmallVectorImpl<int> &ScaledMask); /// Compute a map of integer instructions to their minimum legal type /// size. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 723da1a4fd28..a4aa4a7cbd69 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -582,8 +582,9 @@ class IRTranslator : public MachineFunctionPass { /// Get the alignment of the given memory operation instruction. This will /// either be the explicitly specified value or the ABI-required alignment for /// the type being accessed (according to the Module's DataLayout). - /// FIXME: Remove once transition to Align is over. - inline unsigned getMemOpAlignment(const Instruction &I) { + LLVM_ATTRIBUTE_DEPRECATED( + inline unsigned getMemOpAlignment(const Instruction &I), + "Use getMemOpAlign instead") { return getMemOpAlign(I).value(); } diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 069d0aa45095..c68b073ebb8c 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -142,6 +142,23 @@ template <> struct ScalarEnumerationTraits { } }; +template <> struct ScalarTraits<MaybeAlign> { + static void output(const MaybeAlign &Alignment, void *, + llvm::raw_ostream &out) { + out << uint64_t(Alignment ? Alignment->value() : 0U); + } + static StringRef input(StringRef Scalar, void *, MaybeAlign &Alignment) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 10, n)) + return "invalid number"; + if (n > 0 && !isPowerOf2_64(n)) + return "must be 0 or a power of two"; + Alignment = MaybeAlign(n); + return StringRef(); + } + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + } // end namespace yaml } // end namespace llvm @@ -212,7 +229,7 @@ struct MachineStackObject { ObjectType Type = DefaultType; int64_t Offset = 0; uint64_t Size = 0; - unsigned Alignment = 0; + MaybeAlign Alignment = None; TargetStackID::Value StackID; StringValue CalleeSavedRegister; bool CalleeSavedRestored = true; @@ -252,7 +269,7 @@ template <> struct MappingTraits<MachineStackObject> { YamlIO.mapOptional("offset", Object.Offset, (int64_t)0); if (Object.Type != MachineStackObject::VariableSized) YamlIO.mapRequired("size", Object.Size); - YamlIO.mapOptional("alignment", Object.Alignment, (unsigned)0); + YamlIO.mapOptional("alignment", Object.Alignment, None); YamlIO.mapOptional("stack-id", Object.StackID, TargetStackID::Default); YamlIO.mapOptional("callee-saved-register", Object.CalleeSavedRegister, StringValue()); // Don't print it out when it's empty. @@ -278,7 +295,7 @@ struct FixedMachineStackObject { ObjectType Type = DefaultType; int64_t Offset = 0; uint64_t Size = 0; - unsigned Alignment = 0; + MaybeAlign Alignment = None; TargetStackID::Value StackID; bool IsImmutable = false; bool IsAliased = false; @@ -327,7 +344,7 @@ template <> struct MappingTraits<FixedMachineStackObject> { FixedMachineStackObject::DefaultType); // Don't print the default type.
YamlIO.mapOptional("offset", Object.Offset, (int64_t)0); YamlIO.mapOptional("size", Object.Size, (uint64_t)0); - YamlIO.mapOptional("alignment", Object.Alignment, (unsigned)0); + YamlIO.mapOptional("alignment", Object.Alignment, None); YamlIO.mapOptional("stack-id", Object.StackID, TargetStackID::Default); if (Object.Type != FixedMachineStackObject::SpillSlot) { YamlIO.mapOptional("isImmutable", Object.IsImmutable, false); @@ -411,7 +428,7 @@ template <> struct MappingTraits { struct MachineConstantPoolValue { UnsignedValue ID; StringValue Value; - unsigned Alignment = 0; + MaybeAlign Alignment = None; bool IsTargetSpecific = false; bool operator==(const MachineConstantPoolValue &Other) const { @@ -425,7 +442,7 @@ template <> struct MappingTraits<MachineConstantPoolValue> { static void mapping(IO &YamlIO, MachineConstantPoolValue &Constant) { YamlIO.mapRequired("id", Constant.ID); YamlIO.mapOptional("value", Constant.Value, StringValue()); - YamlIO.mapOptional("alignment", Constant.Alignment, (unsigned)0); + YamlIO.mapOptional("alignment", Constant.Alignment, None); YamlIO.mapOptional("isTargetSpecific", Constant.IsTargetSpecific, false); } }; @@ -571,7 +588,7 @@ template <> struct MappingTraits> { struct MachineFunction { StringRef Name; - unsigned Alignment = 0; + MaybeAlign Alignment = None; bool ExposesReturnsTwice = false; // GISel MachineFunctionProperties. bool Legalized = false; @@ -599,7 +616,7 @@ struct MachineFunction { template <> struct MappingTraits<MachineFunction> { static void mapping(IO &YamlIO, MachineFunction &MF) { YamlIO.mapRequired("name", MF.Name); - YamlIO.mapOptional("alignment", MF.Alignment, (unsigned)0); + YamlIO.mapOptional("alignment", MF.Alignment, None); YamlIO.mapOptional("exposesReturnsTwice", MF.ExposesReturnsTwice, false); YamlIO.mapOptional("legalized", MF.Legalized, false); YamlIO.mapOptional("regBankSelected", MF.RegBankSelected, false); diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 03811bc5145c..f0fb7655881b 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -462,10 +462,10 @@ class MachineFrameInfo { /// Return the alignment of the specified stack object. /// FIXME: Remove this function once transition to Align is over. - unsigned getObjectAlignment(int ObjectIdx) const { - assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && - "Invalid Object Idx!"); - return Objects[ObjectIdx + NumFixedObjects].Alignment.value(); + LLVM_ATTRIBUTE_DEPRECATED(inline unsigned getObjectAlignment(int ObjectIdx) + const, + "Use getObjectAlign instead") { + return getObjectAlign(ObjectIdx).value(); } /// Return the alignment of the specified stack object. @@ -475,18 +475,6 @@ class MachineFrameInfo { return Objects[ObjectIdx + NumFixedObjects].Alignment; } - /// setObjectAlignment - Change the alignment of the specified stack object. - /// FIXME: Remove this function once transition to Align is over. - void setObjectAlignment(int ObjectIdx, unsigned Align) { - assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && - "Invalid Object Idx!"); - Objects[ObjectIdx + NumFixedObjects].Alignment = assumeAligned(Align); - - // Only ensure max alignment for the default stack. - if (getStackID(ObjectIdx) == 0) - ensureMaxAlignment(assumeAligned(Align)); - } - /// setObjectAlignment - Change the alignment of the specified stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment) { assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() && @@ -498,6 +486,14 @@ class MachineFrameInfo { ensureMaxAlignment(Alignment); } + /// setObjectAlignment - Change the alignment of the specified stack object. + /// FIXME: Remove this function once transition to Align is over. + LLVM_ATTRIBUTE_DEPRECATED(inline void setObjectAlignment(int ObjectIdx, + unsigned Align), + "Use the version that takes Align instead") { + setObjectAlignment(ObjectIdx, assumeAligned(Align)); + } + /// Return the underlying Alloca of the specified /// stack object if it exists. Returns 0 if none exists. const AllocaInst* getObjectAllocation(int ObjectIdx) const { diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 560d098e0a12..62ad8242ef71 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1114,14 +1114,36 @@ class SelectionDAG { /// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not /// less than FIRST_TARGET_MEMORY_OPCODE. SDValue getMemIntrinsicNode( - unsigned Opcode, const SDLoc &dl, SDVTList VTList, - ArrayRef<SDValue> Ops, EVT MemVT, - MachinePointerInfo PtrInfo, - unsigned Align = 0, - MachineMemOperand::Flags Flags - = MachineMemOperand::MOLoad | MachineMemOperand::MOStore, - uint64_t Size = 0, - const AAMDNodes &AAInfo = AAMDNodes()); + unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops, + EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, + MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore, + uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()); + + inline SDValue getMemIntrinsicNode( + unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops, + EVT MemVT, MachinePointerInfo PtrInfo, MaybeAlign Alignment = None, + MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore, + uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) { + // Ensure that codegen never sees alignment 0 + return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo, + Alignment.getValueOr(getEVTAlign(MemVT)), Flags, + Size, AAInfo); + } + + LLVM_ATTRIBUTE_DEPRECATED( + inline SDValue getMemIntrinsicNode( + unsigned Opcode, const SDLoc &dl, SDVTList VTList, + ArrayRef<SDValue> Ops, EVT MemVT, MachinePointerInfo PtrInfo, + unsigned Alignment, + MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore, + uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()), + "") { + return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo, + MaybeAlign(Alignment), Flags, Size, AAInfo); + } SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops, EVT MemVT, @@ -1793,9 +1815,17 @@ class SelectionDAG { bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const; - /// Infer alignment of a load / store address. Return 0 if - /// it cannot be inferred. - unsigned InferPtrAlignment(SDValue Ptr) const; + /// Infer alignment of a load / store address. Return None if it cannot be + /// inferred.
+ MaybeAlign InferPtrAlign(SDValue Ptr) const; + + LLVM_ATTRIBUTE_DEPRECATED(inline unsigned InferPtrAlignment(SDValue Ptr) + const, + "Use InferPtrAlign instead") { + if (auto A = InferPtrAlign(Ptr)) + return A->value(); + return 0; + } /// Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index fefa8daa60a1..99601c436651 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1962,6 +1962,18 @@ class TargetLoweringBase { return ISD::ZERO_EXTEND; } + /// Returns how the platform's atomic compare and swap expects its comparison + /// value to be extended (ZERO_EXTEND, SIGN_EXTEND, or ANY_EXTEND). This is + /// separate from getExtendForAtomicOps, which is concerned with the + /// sign-extension of the instruction's output, whereas here we are concerned + /// with the sign-extension of the input. For targets with compare-and-swap + /// instructions (or sub-word comparisons in their LL/SC loop expansions), + /// the input can be ANY_EXTEND, but the output will still have a specific + /// extension. + virtual ISD::NodeType getExtendForAtomicCmpSwapArg() const { + return ISD::ANY_EXTEND; + } + /// @} /// Returns true if we should normalize diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 441713c2595d..be5152d09ad2 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -1309,11 +1309,14 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &&MU) { if (MU->getSymbols().empty()) { // Empty MUs are allowable but pathological, so issue a warning. DEBUG_WITH_TYPE("orc", { - dbgs() << "Warning: Discarding empty MU " << MU->getName() << "\n"; + dbgs() << "Warning: Discarding empty MU " << MU->getName() << " for " + << getName() << "\n"; }); return Error::success(); } else - DEBUG_WITH_TYPE("orc", dbgs() << "Defining MU " << MU->getName() << ":\n"); + DEBUG_WITH_TYPE("orc", { + dbgs() << "Defining MU " << MU->getName() << " for " << getName() << "\n"; + }); return ES.runSessionLocked([&, this]() -> Error { if (auto Err = defineImpl(*MU)) @@ -1340,11 +1343,14 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &MU) { if (MU->getSymbols().empty()) { // Empty MUs are allowable but pathological, so issue a warning. DEBUG_WITH_TYPE("orc", { - dbgs() << "Warning: Discarding empty MU " << MU->getName() << "\n"; + dbgs() << "Warning: Discarding empty MU " << MU->getName() << getName() + << "\n"; }); return Error::success(); } else - DEBUG_WITH_TYPE("orc", dbgs() << "Defining MU " << MU->getName() << ":\n"); + DEBUG_WITH_TYPE("orc", { + dbgs() << "Defining MU " << MU->getName() << " for " << getName() << "\n"; + }); return ES.runSessionLocked([&, this]() -> Error { if (auto Err = defineImpl(*MU)) diff --git a/llvm/include/llvm/MC/LaneBitmask.h b/llvm/include/llvm/MC/LaneBitmask.h index b070bea3201c..a467407f1706 100644 --- a/llvm/include/llvm/MC/LaneBitmask.h +++ b/llvm/include/llvm/MC/LaneBitmask.h @@ -40,7 +40,7 @@ namespace llvm { // When changing the underlying type, change the format string as well.
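// Note on the format-string change below: "%016llX" expects 'unsigned long
// long', which matches uint64_t everywhere, while the old "%016lX" assumed a
// 64-bit 'unsigned long' - an assumption that does not hold on LLP64
// platforms such as 64-bit Windows.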
using Type = uint64_t; enum : unsigned { BitWidth = 8*sizeof(Type) }; - constexpr static const char *const FormatStr = "%016lX"; + constexpr static const char *const FormatStr = "%016llX"; constexpr LaneBitmask() = default; explicit constexpr LaneBitmask(Type V) : Mask(V) {} diff --git a/llvm/include/llvm/MC/MCDirectives.h b/llvm/include/llvm/MC/MCDirectives.h index ea79e68674e5..cad08c8574d2 100644 --- a/llvm/include/llvm/MC/MCDirectives.h +++ b/llvm/include/llvm/MC/MCDirectives.h @@ -16,34 +16,34 @@ namespace llvm { enum MCSymbolAttr { - MCSA_Invalid = 0, ///< Not a valid directive. + MCSA_Invalid = 0, ///< Not a valid directive. // Various directives in alphabetical order. - MCSA_Cold, ///< .cold (MachO) - MCSA_ELF_TypeFunction, ///< .type _foo, STT_FUNC # aka @function - MCSA_ELF_TypeIndFunction, ///< .type _foo, STT_GNU_IFUNC - MCSA_ELF_TypeObject, ///< .type _foo, STT_OBJECT # aka @object - MCSA_ELF_TypeTLS, ///< .type _foo, STT_TLS # aka @tls_object - MCSA_ELF_TypeCommon, ///< .type _foo, STT_COMMON # aka @common - MCSA_ELF_TypeNoType, ///< .type _foo, STT_NOTYPE # aka @notype + MCSA_Cold, ///< .cold (MachO) + MCSA_ELF_TypeFunction, ///< .type _foo, STT_FUNC # aka @function + MCSA_ELF_TypeIndFunction, ///< .type _foo, STT_GNU_IFUNC + MCSA_ELF_TypeObject, ///< .type _foo, STT_OBJECT # aka @object + MCSA_ELF_TypeTLS, ///< .type _foo, STT_TLS # aka @tls_object + MCSA_ELF_TypeCommon, ///< .type _foo, STT_COMMON # aka @common + MCSA_ELF_TypeNoType, ///< .type _foo, STT_NOTYPE # aka @notype MCSA_ELF_TypeGnuUniqueObject, /// .type _foo, @gnu_unique_object - MCSA_Global, ///< .globl - MCSA_LGlobal, ///< .lglobl (XCOFF) - MCSA_Hidden, ///< .hidden (ELF) - MCSA_IndirectSymbol, ///< .indirect_symbol (MachO) - MCSA_Internal, ///< .internal (ELF) - MCSA_LazyReference, ///< .lazy_reference (MachO) - MCSA_Local, ///< .local (ELF) - MCSA_NoDeadStrip, ///< .no_dead_strip (MachO) - MCSA_SymbolResolver, ///< .symbol_resolver (MachO) - MCSA_AltEntry, ///< .alt_entry (MachO) - MCSA_PrivateExtern, ///< .private_extern (MachO) - MCSA_Protected, ///< .protected (ELF) - MCSA_Reference, ///< .reference (MachO) - MCSA_Weak, ///< .weak - MCSA_WeakDefinition, ///< .weak_definition (MachO) - MCSA_WeakReference, ///< .weak_reference (MachO) - MCSA_WeakDefAutoPrivate ///< .weak_def_can_be_hidden (MachO) + MCSA_Global, ///< .globl + MCSA_LGlobal, ///< .lglobl (XCOFF) + MCSA_Hidden, ///< .hidden (ELF) + MCSA_IndirectSymbol, ///< .indirect_symbol (MachO) + MCSA_Internal, ///< .internal (ELF) + MCSA_LazyReference, ///< .lazy_reference (MachO) + MCSA_Local, ///< .local (ELF) + MCSA_NoDeadStrip, ///< .no_dead_strip (MachO) + MCSA_SymbolResolver, ///< .symbol_resolver (MachO) + MCSA_AltEntry, ///< .alt_entry (MachO) + MCSA_PrivateExtern, ///< .private_extern (MachO) + MCSA_Protected, ///< .protected (ELF) + MCSA_Reference, ///< .reference (MachO) + MCSA_Weak, ///< .weak + MCSA_WeakDefinition, ///< .weak_definition (MachO) + MCSA_WeakReference, ///< .weak_reference (MachO) + MCSA_WeakDefAutoPrivate ///< .weak_def_can_be_hidden (MachO) }; enum MCAssemblerFlag { diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index bde0835b6a55..4c8a895592ef 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -259,6 +259,8 @@ class MCRelaxableFragment : public MCEncodedFragmentWithFixups<8, 1> { /// The instruction this is a fragment for. MCInst Inst; + /// Can we auto pad the instruction? 
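+ /// Set when automatic padding of instructions is enabled (used, e.g., by
+ /// the X86 backend when aligning branches); see the accessors below.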
+ bool AllowAutoPadding = false; public: MCRelaxableFragment(const MCInst &Inst, const MCSubtargetInfo &STI, @@ -269,6 +271,9 @@ class MCRelaxableFragment : public MCEncodedFragmentWithFixups<8, 1> { const MCInst &getInst() const { return Inst; } void setInst(const MCInst &Value) { Inst = Value; } + bool getAllowAutoPadding() const { return AllowAutoPadding; } + void setAllowAutoPadding(bool V) { AllowAutoPadding = V; } + static bool classof(const MCFragment *F) { return F->getKind() == MCFragment::FT_Relaxable; } diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index 252f0c8e212e..1b12a9b23130 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -282,6 +282,11 @@ struct HashSection : Section { Optional<std::vector<llvm::yaml::Hex64>> Bucket; Optional<std::vector<llvm::yaml::Hex64>> Chain; + // The following members are used to override section fields. + // This is useful for creating invalid objects. + Optional<llvm::yaml::Hex64> NBucket; + Optional<llvm::yaml::Hex64> NChain; + HashSection() : Section(ChunkKind::Hash) {} static bool classof(const Chunk *S) { return S->Kind == ChunkKind::Hash; } diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 2113197c067d..e5c50bdfbd5c 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -112,6 +112,7 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/KnowledgeRetention.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/Allocator.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" namespace llvm { @@ -700,11 +701,7 @@ struct Attributor { : Functions(Functions), InfoCache(InfoCache), CGUpdater(CGUpdater), DepRecomputeInterval(DepRecomputeInterval), Whitelist(Whitelist) {} - ~Attributor() { - DeleteContainerPointers(AllAbstractAttributes); - for (auto &It : ArgumentReplacementMap) - DeleteContainerPointers(It.second); - } + ~Attributor(); /// Run the analyses until a fixpoint is reached or enforced (timeout). /// @@ -1070,6 +1067,9 @@ struct Attributor { /// Return the data layout associated with the anchor scope. const DataLayout &getDataLayout() const { return InfoCache.DL; } + /// The allocator used to allocate memory, e.g. for `AbstractAttribute`s. + BumpPtrAllocator Allocator; + private: /// Check \p Pred on all call sites of \p Fn.
/// diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 2ae44caaaa32..a240571a39da 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -153,21 +153,6 @@ int TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, return Cost; } -int TargetTransformInfo::getCallCost(FunctionType *FTy, int NumArgs, - const User *U) const { - int Cost = TTIImpl->getCallCost(FTy, NumArgs, U); - assert(Cost >= 0 && "TTI should not produce negative costs!"); - return Cost; -} - -int TargetTransformInfo::getCallCost(const Function *F, - ArrayRef<const Value *> Arguments, - const User *U) const { - int Cost = TTIImpl->getCallCost(F, Arguments, U); - assert(Cost >= 0 && "TTI should not produce negative costs!"); - return Cost; -} - unsigned TargetTransformInfo::getInliningThresholdMultiplier() const { return TTIImpl->getInliningThresholdMultiplier(); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index c8ca2052919c..8b98e05a6884 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -397,6 +397,22 @@ bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) { return false; } +void llvm::scaleShuffleMask(size_t Scale, ArrayRef<int> Mask, + SmallVectorImpl<int> &ScaledMask) { + assert(Scale > 0 && "Unexpected scaling factor"); + + // Fast-path: if no scaling, then it is just a copy. + if (Scale == 1) { + ScaledMask.assign(Mask.begin(), Mask.end()); + return; + } + + ScaledMask.clear(); + for (int MaskElt : Mask) + for (int ScaleElt = 0; ScaleElt != (int)Scale; ++ScaleElt) + ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + ScaleElt); +} + MapVector<Instruction *, uint64_t> llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB, const TargetTransformInfo *TTI) { diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 213af320531c..75487075cb7a 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -34,7 +34,7 @@ GISelKnownBits::GISelKnownBits(MachineFunction &MF, unsigned MaxDepth) Align GISelKnownBits::inferAlignmentForFrameIdx(int FrameIdx, int Offset, const MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); - return commonAlignment(Align(MFI.getObjectAlignment(FrameIdx)), Offset); + return commonAlignment(MFI.getObjectAlign(FrameIdx), Offset); // TODO: How to handle cases with Base + Offset?
} diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 454a63e674d2..bb144480bbd4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -634,6 +634,7 @@ LegalizerHelper::libcall(MachineInstr &MI) { auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); MIRBuilder.setInstr(MI); + MIRBuilder.setDebugLoc(MI.getDebugLoc()); switch (MI.getOpcode()) { default: @@ -731,6 +732,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { MIRBuilder.setInstr(MI); + MIRBuilder.setDebugLoc(MI.getDebugLoc()); uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); uint64_t NarrowSize = NarrowTy.getSizeInBits(); @@ -1596,6 +1598,7 @@ LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MIRBuilder.setInstr(MI); + MIRBuilder.setDebugLoc(MI.getDebugLoc()); switch (MI.getOpcode()) { default: @@ -2188,6 +2191,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; MIRBuilder.setInstr(MI); + MIRBuilder.setDebugLoc(MI.getDebugLoc()); switch(MI.getOpcode()) { default: @@ -3223,6 +3227,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, using namespace TargetOpcode; MIRBuilder.setInstr(MI); + MIRBuilder.setDebugLoc(MI.getDebugLoc()); switch (MI.getOpcode()) { case G_IMPLICIT_DEF: return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy); diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 5022726dc70a..6c5ef0255a08 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -79,11 +79,11 @@ namespace { using StackObjSet = SmallSetVector<int, 8>; void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, int64_t &Offset, - bool StackGrowsDown, unsigned &MaxAlign); + bool StackGrowsDown, Align &MaxAlign); void AssignProtectedObjSet(const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs, MachineFrameInfo &MFI, bool StackGrowsDown, - int64_t &Offset, unsigned &MaxAlign); + int64_t &Offset, Align &MaxAlign); void calculateFrameObjectOffsets(MachineFunction &Fn); bool insertFrameReferenceRegisters(MachineFunction &Fn); @@ -140,22 +140,21 @@ bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { } /// AdjustStackOffset - Helper function used to adjust the stack frame offset. -void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, - int FrameIdx, int64_t &Offset, - bool StackGrowsDown, - unsigned &MaxAlign) { +void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, + int64_t &Offset, bool StackGrowsDown, + Align &MaxAlign) { // If the stack grows down, add the object size to find the lowest address. if (StackGrowsDown) Offset += MFI.getObjectSize(FrameIdx); - unsigned Align = MFI.getObjectAlignment(FrameIdx); + Align Alignment = MFI.getObjectAlign(FrameIdx); // If the alignment of this object is greater than that of the stack, then // increase the stack alignment to match. - MaxAlign = std::max(MaxAlign, Align); + MaxAlign = std::max(MaxAlign, Alignment); // Adjust to alignment boundary. - Offset = (Offset + Align - 1) / Align * Align; + Offset = alignTo(Offset, Alignment); int64_t LocalOffset = StackGrowsDown ?
-Offset : Offset; LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset " @@ -173,11 +172,10 @@ void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, /// AssignProtectedObjSet - Helper function to assign large stack objects (i.e., /// those required to be close to the Stack Protector) to stack offsets. -void LocalStackSlotPass::AssignProtectedObjSet(const StackObjSet &UnassignedObjs, - SmallSet<int, 16> &ProtectedObjs, - MachineFrameInfo &MFI, - bool StackGrowsDown, int64_t &Offset, - unsigned &MaxAlign) { +void LocalStackSlotPass::AssignProtectedObjSet( + const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs, + MachineFrameInfo &MFI, bool StackGrowsDown, int64_t &Offset, + Align &MaxAlign) { for (StackObjSet::const_iterator I = UnassignedObjs.begin(), E = UnassignedObjs.end(); I != E; ++I) { int i = *I; @@ -195,7 +193,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { bool StackGrowsDown = TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; int64_t Offset = 0; - unsigned MaxAlign = 0; + Align MaxAlign; // Make sure that the stack protector comes before the local variables on the // stack. @@ -262,7 +260,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // Remember how big this blob of stack space is MFI.setLocalFrameSize(Offset); - MFI.setLocalFrameMaxAlign(assumeAligned(MaxAlign)); + MFI.setLocalFrameMaxAlign(MaxAlign); } static inline bool diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index cad0a8d0899a..135b2f2234af 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -401,8 +401,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, Target.reset(new PerTargetMIParsingState(MF.getSubtarget())); } - if (YamlMF.Alignment) - MF.setAlignment(Align(YamlMF.Alignment)); + MF.setAlignment(YamlMF.Alignment.valueOrOne()); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); MF.setHasWinCFI(YamlMF.HasWinCFI); @@ -691,7 +690,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, return error(Object.ID.SourceRange.Start, Twine("StackID is not supported by target")); MFI.setStackID(ObjectIdx, Object.StackID); - MFI.setObjectAlignment(ObjectIdx, Object.Alignment); + MFI.setObjectAlignment(ObjectIdx, Object.Alignment.valueOrOne()); if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value, ObjectIdx)) .second) @@ -723,10 +722,11 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, return error(Object.ID.SourceRange.Start, Twine("StackID is not supported by target")); if (Object.Type == yaml::MachineStackObject::VariableSized) - ObjectIdx = MFI.CreateVariableSizedObject(Object.Alignment, Alloca); + ObjectIdx = + MFI.CreateVariableSizedObject(Object.Alignment.valueOrOne(), Alloca); else ObjectIdx = MFI.CreateStackObject( - Object.Size, Object.Alignment, + Object.Size, Object.Alignment.valueOrOne(), Object.Type == yaml::MachineStackObject::SpillSlot, Alloca, Object.StackID); MFI.setObjectOffset(ObjectIdx, Object.Offset); @@ -838,11 +838,11 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS, parseConstantValue(YamlConstant.Value.Value, Error, M)); if (!Value) return error(Error, YamlConstant.Value.SourceRange); - unsigned Alignment = - YamlConstant.Alignment - ?
YamlConstant.Alignment - : M.getDataLayout().getPrefTypeAlignment(Value->getType()); - unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment); + const Align PrefTypeAlign = + M.getDataLayout().getPrefTypeAlign(Value->getType()); + const Align Alignment = YamlConstant.Alignment.getValueOr(PrefTypeAlign); + unsigned Index = + ConstantPool.getConstantPoolIndex(Value, Alignment.value()); if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index)) .second) return error(YamlConstant.ID.SourceRange.Start, diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 22f7e1644a48..58eb720ca799 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -198,7 +198,7 @@ void MIRPrinter::print(const MachineFunction &MF) { yaml::MachineFunction YamlMF; YamlMF.Name = MF.getName(); - YamlMF.Alignment = MF.getAlignment().value(); + YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); YamlMF.HasWinCFI = MF.hasWinCFI(); @@ -373,7 +373,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, : yaml::FixedMachineStackObject::DefaultType; YamlObject.Offset = MFI.getObjectOffset(I); YamlObject.Size = MFI.getObjectSize(I); - YamlObject.Alignment = MFI.getObjectAlignment(I); + YamlObject.Alignment = MFI.getObjectAlign(I); YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I); YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I); YamlObject.IsAliased = MFI.isAliasedObjectIndex(I); @@ -400,7 +400,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, : yaml::MachineStackObject::DefaultType; YamlObject.Offset = MFI.getObjectOffset(I); YamlObject.Size = MFI.getObjectSize(I); - YamlObject.Alignment = MFI.getObjectAlignment(I); + YamlObject.Alignment = MFI.getObjectAlign(I); YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I); YMF.StackObjects.push_back(YamlObject); @@ -514,7 +514,7 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, yaml::MachineConstantPoolValue YamlConstant; YamlConstant.ID = ID++; YamlConstant.Value = StrOS.str(); - YamlConstant.Alignment = Constant.getAlignment(); + YamlConstant.Alignment = MaybeAlign(Constant.getAlignment()); YamlConstant.IsTargetSpecific = Constant.isMachineConstantPoolEntry(); MF.Constants.push_back(YamlConstant); diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index a5bea1463468..41b6de1441d7 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -466,7 +466,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, const MachineFunction &MF = *Before->getMF(); const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned NeedSize = TRI->getSpillSize(RC); - unsigned NeedAlign = TRI->getSpillAlignment(RC); + Align NeedAlign = TRI->getSpillAlign(RC); unsigned SI = Scavenged.size(), Diff = std::numeric_limits::max(); int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd(); @@ -478,7 +478,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, if (FI < FIB || FI >= FIE) continue; unsigned S = MFI.getObjectSize(FI); - unsigned A = MFI.getObjectAlignment(FI); + Align A = MFI.getObjectAlign(FI); if (NeedSize > S || NeedAlign > A) continue; // Avoid wasting slots with large size and/or large alignment. Pick one @@ -487,7 +487,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, // larger register is reserved before a slot for a smaller one. 
When // trying to spill a smaller register, the large slot would be found // first, thus making it impossible to spill the larger register later. - unsigned D = (S-NeedSize) + (A-NeedAlign); + unsigned D = (S - NeedSize) + (A.value() - NeedAlign.value()); if (D < Diff) { SI = I; Diff = D; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8dc1c538ee39..77fa5c793c8e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13109,8 +13109,12 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue DAGCombiner::visitFSQRT(SDNode *N) { SDNodeFlags Flags = N->getFlags(); - if (!DAG.getTarget().Options.UnsafeFPMath && - !Flags.hasApproximateFuncs()) + const TargetOptions &Options = DAG.getTarget().Options; + + // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as: + // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN + if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) || + (!Options.NoInfsFPMath && !Flags.hasNoInfs())) return SDValue(); SDValue N0 = N->getOperand(0); @@ -14629,11 +14633,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // Try to infer better alignment information than the load already has. if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) { - if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { - if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) { + if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { + if (*Alignment > LD->getAlign() && + isAligned(*Alignment, LD->getSrcValueOffset())) { SDValue NewLoad = DAG.getExtLoad( LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, - LD->getPointerInfo(), LD->getMemoryVT(), Align, + LD->getPointerInfo(), LD->getMemoryVT(), *Alignment, LD->getMemOperand()->getFlags(), LD->getAAInfo()); // NewLoad will always be N as we are only refining the alignment assert(NewLoad.getNode() == N); @@ -16695,11 +16700,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) { - if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { - if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) { + if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { + if (*Alignment > ST->getAlign() && + isAligned(*Alignment, ST->getSrcValueOffset())) { SDValue NewStore = DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), - ST->getMemoryVT(), Align, + ST->getMemoryVT(), *Alignment, ST->getMemOperand()->getFlags(), ST->getAAInfo()); // NewStore will always be N as we are only refining the alignment assert(NewStore.getNode() == N); @@ -19815,8 +19821,8 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { ShuffleVectorSDNode *InnerSVN = cast(BC0); SmallVector InnerMask; SmallVector OuterMask; - scaleShuffleMask(InnerScale, InnerSVN->getMask(), InnerMask); - scaleShuffleMask(OuterScale, SVN->getMask(), OuterMask); + scaleShuffleMask(InnerScale, InnerSVN->getMask(), InnerMask); + scaleShuffleMask(OuterScale, SVN->getMask(), OuterMask); // Merge the shuffle masks. 
SmallVector NewMask; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 0248b5121e3f..ed67f7dc8ea3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -278,8 +278,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, return Res.getValue(1); } - SDValue Op2 = GetPromotedInteger(N->getOperand(2)); + // Op2 is used for the comparison and thus must be extended according to the + // target's atomic operations. Op3 is merely stored and so can be left alone. + SDValue Op2 = N->getOperand(2); SDValue Op3 = GetPromotedInteger(N->getOperand(3)); + switch (TLI.getExtendForAtomicCmpSwapArg()) { + case ISD::SIGN_EXTEND: + Op2 = SExtPromotedInteger(Op2); + break; + case ISD::ZERO_EXTEND: + Op2 = ZExtPromotedInteger(Op2); + break; + case ISD::ANY_EXTEND: + Op2 = GetPromotedInteger(Op2); + break; + default: + llvm_unreachable("Invalid atomic op extension"); + } + SDVTList VTs = DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 98bbaefbb584..4c8e95e7b256 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5916,7 +5916,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; - MaybeAlign SrcAlign(DAG.InferPtrAlignment(Src)); + MaybeAlign SrcAlign = DAG.InferPtrAlign(Src); if (!SrcAlign || Alignment > *SrcAlign) SrcAlign = Alignment; assert(SrcAlign && "SrcAlign must be set"); @@ -6101,7 +6101,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; - MaybeAlign SrcAlign(DAG.InferPtrAlignment(Src)); + MaybeAlign SrcAlign = DAG.InferPtrAlign(Src); if (!SrcAlign || Alignment > *SrcAlign) SrcAlign = Alignment; assert(SrcAlign && "SrcAlign must be set"); @@ -6679,7 +6679,7 @@ SDValue SelectionDAG::getMergeValues(ArrayRef Ops, const SDLoc &dl) { SDValue SelectionDAG::getMemIntrinsicNode( unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef Ops, - EVT MemVT, MachinePointerInfo PtrInfo, unsigned Alignment, + EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) { if (!Size && MemVT.isScalableVector()) Size = MemoryLocation::UnknownSize; @@ -6687,9 +6687,8 @@ SDValue SelectionDAG::getMemIntrinsicNode( Size = MemVT.getStoreSize(); MachineFunction &MF = getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, Flags, Size, Alignment ? Align(Alignment) : getEVTAlign(MemVT), - AAInfo); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo); return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } @@ -9419,9 +9418,9 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, return false; } -/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if -/// it cannot be inferred. -unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { +/// InferPtrAlignment - Infer alignment of a load / store address. Return None +/// if it cannot be inferred. 
+MaybeAlign SelectionDAG::InferPtrAlign(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. const GlobalValue *GV = nullptr; int64_t GVOffset = 0; @@ -9430,9 +9429,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { KnownBits Known(PtrWidth); llvm::computeKnownBits(GV, Known, getDataLayout()); unsigned AlignBits = Known.countMinTrailingZeros(); - unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; - if (Align) - return MinAlign(Align, GVOffset); + if (AlignBits) + return commonAlignment(Align(1 << std::min(31U, AlignBits)), GVOffset); } // If this is a direct reference to a stack slot, use information about the @@ -9450,12 +9448,10 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { if (FrameIdx != INT_MIN) { const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); - unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), - FrameOffset); - return FIInfoAlign; + return commonAlignment(MFI.getObjectAlign(FrameIdx), FrameOffset); } - return 0; + return None; } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ed24e004f908..624ee71154f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4713,10 +4713,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // This is target intrinsic that touches memory AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - Result = DAG.getMemIntrinsicNode( - Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, - MachinePointerInfo(Info.ptrVal, Info.offset), - Info.align ? Info.align->value() : 0, Info.flags, Info.size, AAInfo); + Result = + DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, + MachinePointerInfo(Info.ptrVal, Info.offset), + Info.align, Info.flags, Info.size, AAInfo); } else if (!HasChain) { Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { @@ -6529,12 +6529,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = getValue(I.getArgOperand(3)); - SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl, - DAG.getVTList(MVT::Other), Ops, - EVT::getIntegerVT(*Context, 8), - MachinePointerInfo(I.getArgOperand(0)), - 0, /* align */ - Flags); + SDValue Result = DAG.getMemIntrinsicNode( + ISD::PREFETCH, sdl, DAG.getVTList(MVT::Other), Ops, + EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)), + /* align */ None, Flags); // Chain the prefetch in parallel with any pending loads, to stay out of // the way of later optimizations. @@ -7335,10 +7333,10 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { SDValue Src = getValue(I.getArgOperand(1)); SDValue Size = getValue(I.getArgOperand(2)); - unsigned DstAlign = DAG.InferPtrAlignment(Dst); - unsigned SrcAlign = DAG.InferPtrAlignment(Src); + Align DstAlign = DAG.InferPtrAlign(Dst).valueOrOne(); + Align SrcAlign = DAG.InferPtrAlign(Src).valueOrOne(); // DAG::getMemcpy needs Alignment to be defined.
- Align Alignment = assumeAligned(std::min(DstAlign, SrcAlign)); + Align Alignment = std::min(DstAlign, SrcAlign); bool isVol = false; SDLoc sdl = getCurSDLoc(); @@ -9494,16 +9492,13 @@ static void tryToElideArgumentCopy( "object size\n"); return; } - unsigned RequiredAlignment = AI->getAlignment(); - if (!RequiredAlignment) { - RequiredAlignment = FuncInfo.MF->getDataLayout().getABITypeAlignment( - AI->getAllocatedType()); - } - if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) { + Align RequiredAlignment = AI->getAlign().getValueOr( + FuncInfo.MF->getDataLayout().getABITypeAlign(AI->getAllocatedType())); + if (MFI.getObjectAlign(FixedIndex) < RequiredAlignment) { LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca " "greater than stack argument alignment (" - << RequiredAlignment << " vs " - << MFI.getObjectAlignment(FixedIndex) << ")\n"); + << RequiredAlignment.value() << " vs " + << MFI.getObjectAlign(FixedIndex).value() << ")\n"); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index dbbcf10be5a7..6626210e9185 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3708,12 +3708,11 @@ bool SelectionDAGISel::isOrEquivalentToAdd(const SDNode *N) const { // Detect when "or" is used to add an offset to a stack object. if (auto *FN = dyn_cast(N->getOperand(0))) { MachineFrameInfo &MFI = MF->getFrameInfo(); - unsigned A = MFI.getObjectAlignment(FN->getIndex()); - assert(isPowerOf2_32(A) && "Unexpected alignment"); + Align A = MFI.getObjectAlign(FN->getIndex()); int32_t Off = C->getSExtValue(); // If the alleged offset fits in the zero bits guaranteed by // the alignment, then this or is really an add. - return (Off >= 0) && (((A - 1) & Off) == unsigned(Off)); + return (Off >= 0) && (((A.value() - 1) & Off) == unsigned(Off)); } return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 3f0c6443211e..e51555239054 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2762,9 +2762,9 @@ void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, unsigned Depth) const { assert(isa(Op) && "expected FrameIndex"); - if (unsigned Align = DAG.InferPtrAlignment(Op)) { + if (MaybeAlign Alignment = DAG.InferPtrAlign(Op)) { // The low bits are known zero if the pointer is aligned. 
- Known.Zero.setLowBits(Log2_32(Align)); + Known.Zero.setLowBits(Log2(*Alignment)); } } diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index 9d4fdc6b624c..5ed5e8a46b08 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -1290,8 +1290,8 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) { SortedSlots[J] = -1; LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #" << SecondSlot << " together.\n"); - unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot), - MFI->getObjectAlignment(SecondSlot)); + Align MaxAlignment = std::max(MFI->getObjectAlign(FirstSlot), + MFI->getObjectAlign(SecondSlot)); assert(MFI->getObjectSize(FirstSlot) >= MFI->getObjectSize(SecondSlot) && diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index 7ae758323280..3cc5d30ebad7 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -74,7 +74,7 @@ namespace { SmallVector<SmallVector<MachineMemOperand *, 8>, 16> SSRefs; // OrigAlignments - Alignments of stack objects before coloring. - SmallVector<unsigned, 16> OrigAlignments; + SmallVector<Align, 16> OrigAlignments; // OrigSizes - Sizes of stack objects before coloring. SmallVector<unsigned, 16> OrigSizes; @@ -227,7 +227,7 @@ void StackSlotColoring::InitializeSlots() { continue; SSIntervals.push_back(&li); - OrigAlignments[FI] = MFI->getObjectAlignment(FI); + OrigAlignments[FI] = MFI->getObjectAlign(FI); OrigSizes[FI] = MFI->getObjectSize(FI); auto StackID = MFI->getStackID(FI); @@ -309,9 +309,9 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) { // Change size and alignment of the allocated slot. If there are multiple // objects sharing the same slot, then make sure the size and alignment // are large enough for all. - unsigned Align = OrigAlignments[FI]; - if (!Share || Align > MFI->getObjectAlignment(Color)) - MFI->setObjectAlignment(Color, Align); + Align Alignment = OrigAlignments[FI]; + if (!Share || Alignment > MFI->getObjectAlign(Color)) + MFI->setObjectAlignment(Color, Alignment); int64_t Size = OrigSizes[FI]; if (!Share || Size > MFI->getObjectSize(Color)) MFI->setObjectSize(Color, Size); diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index a98445a2295e..50f1ca3fe3df 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -349,8 +349,8 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( MemoryBufferRef ChildBufferRef(ChildBufferInfo.first, ChildBufferInfo.second); - if (auto Err = - L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef), VModuleKey())) + if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false), + VModuleKey())) return Err; } diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 3be1381652a1..9a868a6fbac3 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -29,14 +29,6 @@ using namespace llvm::orc; namespace { -/// Add a reference to the __dso_handle global to the given module. -/// Returns a reference to the __dso_handle IR decl. -GlobalVariable *addDSOHandleDecl(Module &M) { - auto DSOHandleTy = StructType::create(M.getContext(), "lljit.dso_handle"); - return new GlobalVariable(M, DSOHandleTy, true, GlobalValue::ExternalLinkage, - nullptr, "__dso_handle"); - } - /// Adds helper function decls and wrapper functions that call the helper with /// some additional prefix arguments.
/// @@ -143,11 +135,10 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { SymbolMap StdInterposes; StdInterposes[Mangle("__lljit.platform_support_instance")] = - JITEvaluatedSymbol(pointerToJITTargetAddress(this), JITSymbolFlags()); + JITEvaluatedSymbol(pointerToJITTargetAddress(this), + JITSymbolFlags::Exported); StdInterposes[Mangle("__lljit.cxa_atexit_helper")] = JITEvaluatedSymbol( pointerToJITTargetAddress(registerAtExitHelper), JITSymbolFlags()); - StdInterposes[Mangle("__lljit.run_atexits_helper")] = JITEvaluatedSymbol( - pointerToJITTargetAddress(runAtExitsHelper), JITSymbolFlags()); cantFail( J.getMainJITDylib().define(absoluteSymbols(std::move(StdInterposes)))); @@ -159,6 +150,14 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { /// Adds a module that defines the __dso_handle global. Error setupJITDylib(JITDylib &JD) { + + // Add per-jitdylib standard interposes. + MangleAndInterner Mangle(getExecutionSession(), J.getDataLayout()); + SymbolMap PerJDInterposes; + PerJDInterposes[Mangle("__lljit.run_atexits_helper")] = JITEvaluatedSymbol( + pointerToJITTargetAddress(runAtExitsHelper), JITSymbolFlags()); + cantFail(JD.define(absoluteSymbols(std::move(PerJDInterposes)))); + auto Ctx = std::make_unique(); auto M = std::make_unique("__standard_lib", *Ctx); M->setDataLayout(J.getDataLayout()); @@ -168,9 +167,23 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { *M, Int64Ty, true, GlobalValue::ExternalLinkage, ConstantInt::get(Int64Ty, reinterpret_cast(&JD)), "__dso_handle"); - DSOHandle->setVisibility(GlobalValue::HiddenVisibility); + DSOHandle->setVisibility(GlobalValue::DefaultVisibility); DSOHandle->setInitializer( ConstantInt::get(Int64Ty, pointerToJITTargetAddress(&JD))); + + auto *GenericIRPlatformSupportTy = + StructType::create(*Ctx, "lljit.GenericLLJITIRPlatformSupport"); + + auto *PlatformInstanceDecl = new GlobalVariable( + *M, GenericIRPlatformSupportTy, true, GlobalValue::ExternalLinkage, + nullptr, "__lljit.platform_support_instance"); + + auto *VoidTy = Type::getVoidTy(*Ctx); + addHelperAndWrapper( + *M, "__lljit_run_atexits", FunctionType::get(VoidTy, {}, false), + GlobalValue::HiddenVisibility, "__lljit.run_atexits_helper", + {PlatformInstanceDecl, DSOHandle}); + return J.addIRModule(JD, ThreadSafeModule(std::move(M), std::move(Ctx))); } @@ -316,6 +329,16 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { } }); + LLVM_DEBUG({ + dbgs() << "JITDylib deinit order is [ "; + for (auto *JD : DFSLinkOrder) + dbgs() << "\"" << JD->getName() << "\" "; + dbgs() << "]\n"; + dbgs() << "Looking up deinit functions:\n"; + for (auto &KV : LookupSymbols) + dbgs() << " \"" << KV.first->getName() << "\": " << KV.second << "\n"; + }); + auto LookupResult = Platform::lookupInitSymbols(ES, LookupSymbols); if (!LookupResult) @@ -387,11 +410,19 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { static void registerAtExitHelper(void *Self, void (*F)(void *), void *Ctx, void *DSOHandle) { + LLVM_DEBUG({ + dbgs() << "Registering atexit function " << (void *)F << " for JD " + << (*static_cast(DSOHandle))->getName() << "\n"; + }); static_cast(Self)->AtExitMgr.registerAtExit( F, Ctx, DSOHandle); } static void runAtExitsHelper(void *Self, void *DSOHandle) { + LLVM_DEBUG({ + dbgs() << "Running atexit functions for JD " + << (*static_cast(DSOHandle))->getName() << "\n"; + }); static_cast(Self)->AtExitMgr.runAtExits( DSOHandle); } @@ -410,8 +441,6 @@ class GenericLLVMIRPlatformSupport : public 
LLJIT::PlatformSupport { *M, GenericIRPlatformSupportTy, true, GlobalValue::ExternalLinkage, nullptr, "__lljit.platform_support_instance"); - auto *DSOHandleDecl = addDSOHandleDecl(*M); - auto *Int8Ty = Type::getInt8Ty(*Ctx); auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); auto *VoidTy = Type::getVoidTy(*Ctx); @@ -423,14 +452,9 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { *M, "__cxa_atexit", FunctionType::get(IntTy, {AtExitCallbackPtrTy, BytePtrTy, BytePtrTy}, false), - GlobalValue::HiddenVisibility, "__lljit.cxa_atexit_helper", + GlobalValue::DefaultVisibility, "__lljit.cxa_atexit_helper", {PlatformInstanceDecl}); - addHelperAndWrapper( - *M, "__lljit_run_atexits", FunctionType::get(VoidTy, {}, false), - GlobalValue::HiddenVisibility, "__lljit.run_atexits_helper", - {PlatformInstanceDecl, DSOHandleDecl}); - return ThreadSafeModule(std::move(M), std::move(Ctx)); } @@ -676,7 +700,7 @@ class MachOPlatformSupport : public LLJIT::PlatformSupport { auto *DSOHandle = new GlobalVariable(M, Int64Ty, true, GlobalValue::ExternalLinkage, ConstantInt::get(Int64Ty, 0), "__dso_handle"); - DSOHandle->setVisibility(GlobalValue::HiddenVisibility); + DSOHandle->setVisibility(GlobalValue::DefaultVisibility); return cantFail(J.getIRCompileLayer().getCompiler()(M)); } diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index b8c57533568b..eabaaa203927 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -1196,6 +1196,10 @@ ConstantRange::binaryAnd(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return getEmpty(); + // Use APInt's implementation of AND for single element ranges. + if (isSingleElement() && Other.isSingleElement()) + return {*getSingleElement() & *Other.getSingleElement()}; + // TODO: replace this with something less conservative APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()); @@ -1207,6 +1211,10 @@ ConstantRange::binaryOr(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return getEmpty(); + // Use APInt's implementation of OR for single element ranges. 
+ if (isSingleElement() && Other.isSingleElement()) + return {*getSingleElement() | *Other.getSingleElement()}; + // TODO: replace this with something less conservative APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index a31a91766ebe..b0b9af9ff573 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4791,6 +4791,42 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call); break; } + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: { + ConstantInt *NumRows; + ConstantInt *NumColumns; + VectorType *TypeToCheck; + switch (ID) { + case Intrinsic::matrix_multiply: + NumRows = cast<ConstantInt>(Call.getArgOperand(2)); + NumColumns = cast<ConstantInt>(Call.getArgOperand(4)); + TypeToCheck = cast<VectorType>(Call.getType()); + break; + case Intrinsic::matrix_transpose: + NumRows = cast<ConstantInt>(Call.getArgOperand(1)); + NumColumns = cast<ConstantInt>(Call.getArgOperand(2)); + TypeToCheck = cast<VectorType>(Call.getType()); + break; + case Intrinsic::matrix_columnwise_load: + NumRows = cast<ConstantInt>(Call.getArgOperand(2)); + NumColumns = cast<ConstantInt>(Call.getArgOperand(3)); + TypeToCheck = cast<VectorType>(Call.getType()); + break; + case Intrinsic::matrix_columnwise_store: + NumRows = cast<ConstantInt>(Call.getArgOperand(3)); + NumColumns = cast<ConstantInt>(Call.getArgOperand(4)); + TypeToCheck = cast<VectorType>(Call.getArgOperand(0)->getType()); + break; + default: + llvm_unreachable("unexpected intrinsic"); + } + Assert(TypeToCheck->getNumElements() == + NumRows->getZExtValue() * NumColumns->getZExtValue(), + "result of a matrix operation does not fit in the returned vector"); + break; + } }; } diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp index 098b3d8f8dd0..7879e2ef651c 100644 --- a/llvm/lib/Object/ObjectFile.cpp +++ b/llvm/lib/Object/ObjectFile.cpp @@ -108,14 +108,17 @@ Triple ObjectFile::makeTriple() const { setARMSubArch(TheTriple); // TheTriple defaults to ELF, and COFF doesn't have an environment: - // the best we can do here is indicate that it is mach-o. - if (isMachO()) + // something we can do here is indicate that it is mach-o. + if (isMachO()) { TheTriple.setObjectFormat(Triple::MachO); - - if (isCOFF()) { + } else if (isCOFF()) { const auto COFFObj = cast<COFFObjectFile>(this); if (COFFObj->getArch() == Triple::thumb) TheTriple.setTriple("thumbv7-windows"); + } else if (isXCOFF()) { + // XCOFF implies AIX.
+ TheTriple.setOS(Triple::AIX); + TheTriple.setObjectFormat(Triple::XCOFF); } return TheTriple; } diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index b48c377debda..71e21b90344d 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1065,10 +1065,13 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader, return; } - support::endian::write<uint32_t>(OS, Section.Bucket->size(), - ELFT::TargetEndianness); - support::endian::write<uint32_t>(OS, Section.Chain->size(), - ELFT::TargetEndianness); + support::endian::write<uint32_t>( + OS, Section.NBucket.getValueOr(llvm::yaml::Hex64(Section.Bucket->size())), + ELFT::TargetEndianness); + support::endian::write<uint32_t>( + OS, Section.NChain.getValueOr(llvm::yaml::Hex64(Section.Chain->size())), + ELFT::TargetEndianness); + for (uint32_t Val : *Section.Bucket) support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness); for (uint32_t Val : *Section.Chain) support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 7221d9b5736a..5adcb25dcec4 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1091,6 +1091,13 @@ static void sectionMapping(IO &IO, ELFYAML::HashSection &Section) { IO.mapOptional("Bucket", Section.Bucket); IO.mapOptional("Chain", Section.Chain); IO.mapOptional("Size", Section.Size); + + // obj2yaml does not dump these fields. They can be used to override nchain + // and nbucket values for creating broken sections. + assert(!IO.outputting() || + (!Section.NBucket.hasValue() && !Section.NChain.hasValue())); + IO.mapOptional("NChain", Section.NChain); + IO.mapOptional("NBucket", Section.NBucket); } static void sectionMapping(IO &IO, ELFYAML::NoteSection &Section) { diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp index 93be0535d1b9..df955cdf5d30 100644 --- a/llvm/lib/Support/ELFAttributeParser.cpp +++ b/llvm/lib/Support/ELFAttributeParser.cpp @@ -217,7 +217,7 @@ Error ELFAttributeParser::parse(ArrayRef<uint8_t> section, if (sectionLength < 4 || cursor.tell() - 4 + sectionLength > section.size()) return createStringError(errc::invalid_argument, - "invalid subsection length " + + "invalid section length " + Twine(sectionLength) + " at offset 0x" + utohexstr(cursor.tell() - 4)); diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index dfe81d7e2833..a68b30a546c8 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -280,7 +280,7 @@ bool Process::FileDescriptorIsDisplayed(int fd) { #endif } -static unsigned getColumns(int FileID) { +static unsigned getColumns() { // If COLUMNS is defined in the environment, wrap to that many columns. if (const char *ColumnsStr = std::getenv("COLUMNS")) { int Columns = std::atoi(ColumnsStr); @@ -288,31 +288,23 @@ static unsigned getColumns(int FileID) { return Columns; } - unsigned Columns = 0; - -#if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H) \ - && !(defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)) - // Try to determine the width of the terminal. - struct winsize ws; - if (ioctl(FileID, TIOCGWINSZ, &ws) == 0) - Columns = ws.ws_col; -#endif - - return Columns; + // We used to call ioctl TIOCGWINSZ to determine the width. It is considered + // unuseful.
+ return 0; } unsigned Process::StandardOutColumns() { if (!StandardOutIsDisplayed()) return 0; - return getColumns(1); + return getColumns(); } unsigned Process::StandardErrColumns() { if (!StandardErrIsDisplayed()) return 0; - return getColumns(2); + return getColumns(); } #ifdef HAVE_TERMINFO diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 10a540f8bfa6..17b13f6f96fb 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -216,6 +216,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const { return TargetStackID::SVEVector; } +/// Returns the size of the fixed object area (allocated next to sp on entry) +/// On Win64 this may include a var args area and an UnwindHelp object for EH. +static unsigned getFixedObjectSize(const MachineFunction &MF, + const AArch64FunctionInfo *AFI, bool IsWin64, + bool IsFunclet) { + if (!IsWin64 || IsFunclet) { + // Only Win64 uses fixed objects, and then only for the function (not + // funclets) + return 0; + } else { + // Var args are stored here in the primary function. + const unsigned VarArgsArea = AFI->getVarArgsGPRSize(); + // To support EH funclets we allocate an UnwindHelp object + const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0); + return alignTo(VarArgsArea + UnwindHelpObject, 16); + } +} + /// Returns the size of the entire SVE stackframe (calleesaves + spills). static StackOffset getSVEStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo(); @@ -995,10 +1013,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = (IsWin64 && !IsFunclet) ? - alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. @@ -1029,32 +1044,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; } - // The code below is not applicable to funclets. We have emitted all the SEH - // opcodes that we needed to emit. The FP and BP belong to the containing - // function. - if (IsFunclet) { - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - - // SEH funclets are passed the frame pointer in X1. If the parent - // function uses the base register, then the base register is used - // directly, and is not retrieved from X1. - if (F.hasPersonalityFn()) { - EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); - if (isAsynchronousEHPersonality(Per)) { - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) - .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup); - MBB.addLiveIn(AArch64::X1); - } - } - - return; - } - - if (HasFP) { + // For funclets the FP belongs to the containing function. + if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; @@ -1197,7 +1188,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Allocate space for the rest of the frame. 
if (NumBytes) { - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + // Alignment is required for the parent frame, not the funclet + const bool NeedsRealignment = + !IsFunclet && RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1250,7 +1243,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is // needed. - if (RegInfo->hasBasePointer(MF)) { + // For funclets the BP belongs to the containing function. + if (!IsFunclet && RegInfo->hasBasePointer(MF)) { TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, false); if (NeedsWinCFI) { @@ -1267,6 +1261,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + // SEH funclets are passed the frame pointer in X1. If the parent + // function uses the base register, then the base register is used + // directly, and is not retrieved from X1. + if (IsFunclet && F.hasPersonalityFn()) { + EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); + if (isAsynchronousEHPersonality(Per)) { + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) + .addReg(AArch64::X1) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(AArch64::X1); + } + } + if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); const int StackGrowth = isTargetDarwin(MF) @@ -1485,10 +1492,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - // Var args are accounted for in the containing function, so don't - // include them for funclets. - unsigned FixedObject = - (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); uint64_t AfterCSRPopSize = ArgumentPopSize; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; @@ -1714,7 +1718,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + unsigned FixedObject = + getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; @@ -2093,8 +2099,8 @@ static void computeCalleeSaveRegisterPairs( FixupDone = true; ByteOffset -= 8; assert(ByteOffset % 16 == 0); - assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); - MFI.setObjectAlignment(RPI.FrameIdx, 16); + assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); + MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); } int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset; @@ -2584,12 +2590,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { // Make sure to align the last callee save slot. - MFI.setObjectAlignment(MaxCSFrameIndex, 16U); + MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); // Assign offsets to the callee save slots. 
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlignment(I)); + Offset = alignTo(Offset, MFI.getObjectAlign(I)); if (AssignOffsets) Assign(I, -Offset); } @@ -2611,15 +2617,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Allocate all SVE locals and spills for (unsigned FI : ObjectsToAllocate) { - unsigned Align = MFI.getObjectAlignment(FI); + Align Alignment = MFI.getObjectAlign(FI); // FIXME: Given that the length of SVE vectors is not necessarily a power of // two, we'd need to align every object dynamically at runtime if the // alignment is larger than 16. This is not yet supported. - if (Align > 16) + if (Alignment > Align(16)) report_fatal_error( "Alignment of scalable vectors > 16 bytes is not yet supported"); - Offset = alignTo(Offset + MFI.getObjectSize(FI), Align); + Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); if (AssignOffsets) Assign(FI, -Offset); } @@ -2667,9 +2673,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( ++MBBI; // Create an UnwindHelp object. - int UnwindHelpFI = - MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); + // The UnwindHelp object is allocated at the start of the fixed object area + int64_t FixedObject = + getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false); + int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8, + /*SPOffset*/ -FixedObject, + /*IsImmutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; + // We need to store -2 into the UnwindHelp object at the start of the // function. DebugLoc DL; @@ -3081,10 +3092,14 @@ int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " - << MFI.getObjectOffset(FI) << "\n"); - FrameReg = AArch64::SP; - return MFI.getObjectOffset(FI); + if (IgnoreSPUpdates) { + LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " + << MFI.getObjectOffset(FI) << "\n"); + FrameReg = AArch64::SP; + return MFI.getObjectOffset(FI); + } + + return getFrameIndexReference(MF, FI, FrameReg); } /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 698189e14c21..61b78acad3f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -55,6 +55,9 @@ struct ImageDimIntrinsicInfo { }; const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr); +const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode, + unsigned Dim); + } // end AMDGPU namespace } // End llvm namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9e1fb426116b..c9b065cdd8d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1340,7 +1340,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } // TODO: Check this in verifier. 
- assert(!IsTexFail || DMaskLanes >= 1 && "should have legalized this"); + assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); bool GLC = false; bool SLC = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c88569604227..5ee3267822b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2075,7 +2075,7 @@ bool AMDGPULegalizerInfo::legalizeFMad( MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); + HelperBuilder.setInstr(MI); return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; } @@ -3722,27 +3722,32 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const ConstantFP *ConstantLod; const int LodIdx = AddrIdx + NumVAddrs - 1; - // FIXME: This isn't the cleanest way to handle this, but it's the easiest - // option the current infrastructure gives. We really should be changing the - // base intrinsic opcode, but the current searchable tables only gives us - // the final MI opcode. Eliminate the register here, and track with an - // immediate 0 so the final selection will know to do the opcode change. if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { if (ConstantLod->isZero() || ConstantLod->isNegative()) { - MI.getOperand(LodIdx).ChangeToImmediate(0); + // Set new opcode to _lz variant of _l, and change the intrinsic ID. + ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( + LZMappingInfo->LZ, ImageDimIntr->Dim); + + // The starting indexes should remain in the same place. + --NumVAddrs; --CorrectedNumVAddrs; + + MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( + static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); + MI.RemoveOperand(LodIdx); } } } // Optimize _mip away, when 'lod' is zero - if (const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = - AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { int64_t ConstantLod; const int LodIdx = AddrIdx + NumVAddrs - 1; if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { if (ConstantLod == 0) { + // TODO: Change intrinsic opcode and remove operand instead of replacing + // it with 0, as the _L to _LZ handling is done above. MI.getOperand(LodIdx).ChangeToImmediate(0); --CorrectedNumVAddrs; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 8a439425062e..c4dbb65ee5b8 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -817,6 +817,11 @@ def ImageDimIntrinsicTable : GenericTable { let PrimaryKeyEarlyOut = 1; } +def getImageDimInstrinsicByBaseOpcode : SearchIndex { + let Table = ImageDimIntrinsicTable; + let Key = ["BaseOpcode", "Dim"]; +} + foreach intr = !listconcat(AMDGPUImageDimIntrinsics, AMDGPUImageDimAtomicIntrinsics) in { def : ImageDimIntrinsicInfo<intr>; } diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index d9aa9ebe878d..d2fe3c9f93c6 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -35,15 +35,15 @@ int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int UpperBound = FI == -1 ?
MFI.getNumObjects() : FI; for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(i)); OffsetBytes += MFI.getObjectSize(i); // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = alignTo(OffsetBytes, 4); + OffsetBytes = alignTo(OffsetBytes, Align(4)); } if (FI != -1) - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(FI)); return OffsetBytes / (getStackWidth(MF) * 4); } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 8a3dea8152f1..3f36150d2a3c 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -503,16 +503,16 @@ bool isUncondBranchOpcode(int Opc) { // the ArmARM. -inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { +inline static ARM::PredBlockMask getARMVPTBlockMask(unsigned NumInsts) { switch (NumInsts) { case 1: - return ARMVCC::T; + return ARM::PredBlockMask::T; case 2: - return ARMVCC::TT; + return ARM::PredBlockMask::TT; case 3: - return ARMVCC::TTT; + return ARM::PredBlockMask::TTT; case 4: - return ARMVCC::TTTT; + return ARM::PredBlockMask::TTTT; default: break; }; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index f424f22464e5..3c6f446580bb 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1191,8 +1191,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, // Only multiples of 4 are allowed for the offset, so the frame object // alignment must be at least 4. MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); @@ -1215,9 +1215,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, if (RHSC * 4 < MFI.getObjectSize(FI)) { // For LHS+RHS to result in an offset that's a multiple of 4 the object // indexed by the LHS must be 4-byte aligned. 
- if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); - if (MFI.getObjectAlignment(FI) >= 4) { + if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); + if (MFI.getObjectAlign(FI) >= Align(4)) { Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); @@ -3420,8 +3420,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, CurDAG->getTargetConstant(0, dl, MVT::i32)); return; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6d03126b2db0..860dfce691a3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2272,9 +2272,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, - MachinePointerInfo(), - DAG.InferPtrAlignment(AddArg)); + SDValue Load = + DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), + DAG.InferPtrAlign(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index b9572ef3e9d0..4ccf62759f5a 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1198,7 +1198,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { if (isVCTP(Divergent->MI)) { // The vctp will be removed, so the size of the vpt block needs to be // modified. - uint64_t Size = getARMVPTBlockMask(Block.size() - 1); + uint64_t Size = (uint64_t)getARMVPTBlockMask(Block.size() - 1); Block.getVPST()->getOperand(0).setImm(Size); LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n"); } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { @@ -1227,7 +1227,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST)); - MIB.addImm(getARMVPTBlockMask(Size)); + MIB.addImm((uint64_t)getARMVPTBlockMask(Size)); LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); LoLoop.ToRemove.insert(Block.getVPST()); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 80450983e513..9304d8339193 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6980,6 +6980,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // ITx -> x100 (ITT -> 0100, ITE -> 1100) // ITxy -> xy10 (e.g. ITET -> 1010) // ITxyz -> xyz1 (e.g. 
ITEET -> 1101) + // Note: See the ARM::PredBlockMask enum in + // /lib/Target/ARM/Utils/ARMBaseInfo.h if (Mnemonic == "it" || Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst")) { SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) : diff --git a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp index c8b725f339e2..2de9829480db 100644 --- a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp +++ b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -34,30 +34,30 @@ using namespace llvm; #define DEBUG_TYPE "arm-mve-vpt" namespace { - class MVEVPTBlock : public MachineFunctionPass { - public: - static char ID; - const Thumb2InstrInfo *TII; - const TargetRegisterInfo *TRI; +class MVEVPTBlock : public MachineFunctionPass { +public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; - MVEVPTBlock() : MachineFunctionPass(ID) {} + MVEVPTBlock() : MachineFunctionPass(ID) {} - bool runOnMachineFunction(MachineFunction &Fn) override; + bool runOnMachineFunction(MachineFunction &Fn) override; - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } - StringRef getPassName() const override { - return "MVE VPT block insertion pass"; - } + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } - private: - bool InsertVPTBlocks(MachineBasicBlock &MBB); - }; +private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); +}; - char MVEVPTBlock::ID = 0; +char MVEVPTBlock::ID = 0; } // end anonymous namespace @@ -94,24 +94,184 @@ static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI, return &*CmpMI; } +static ARM::PredBlockMask ExpandBlockMask(ARM::PredBlockMask BlockMask, + ARMVCC::VPTCodes Kind) { + using PredBlockMask = ARM::PredBlockMask; + assert(Kind != ARMVCC::None && "Cannot expand mask with 'None'"); + assert(countTrailingZeros((unsigned)BlockMask) != 0 && + "Mask is already full"); + + auto ChooseMask = [&](PredBlockMask AddedThen, PredBlockMask AddedElse) { + return (Kind == ARMVCC::Then) ? AddedThen : AddedElse; + }; + + switch (BlockMask) { + case PredBlockMask::T: + return ChooseMask(PredBlockMask::TT, PredBlockMask::TE); + case PredBlockMask::TT: + return ChooseMask(PredBlockMask::TTT, PredBlockMask::TTE); + case PredBlockMask::TE: + return ChooseMask(PredBlockMask::TET, PredBlockMask::TEE); + case PredBlockMask::TTT: + return ChooseMask(PredBlockMask::TTTT, PredBlockMask::TTTE); + case PredBlockMask::TTE: + return ChooseMask(PredBlockMask::TTET, PredBlockMask::TTEE); + case PredBlockMask::TET: + return ChooseMask(PredBlockMask::TETT, PredBlockMask::TETE); + case PredBlockMask::TEE: + return ChooseMask(PredBlockMask::TEET, PredBlockMask::TEEE); + default: + llvm_unreachable("Unknown Mask"); + } +} + +// Advances Iter past a block of predicated instructions. +// Returns true if it successfully skipped the whole block of predicated +// instructions. Returns false when it stopped early (due to MaxSteps), or if +// Iter didn't point to a predicated instruction. 
+static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter, + MachineBasicBlock::instr_iterator EndIter, + unsigned MaxSteps, + unsigned &NumInstrsSteppedOver) { + ARMVCC::VPTCodes NextPred = ARMVCC::None; + unsigned PredReg; + NumInstrsSteppedOver = 0; + + while (Iter != EndIter) { + NextPred = getVPTInstrPredicate(*Iter, PredReg); + assert(NextPred != ARMVCC::Else && + "VPT block pass does not expect Else preds"); + if (NextPred == ARMVCC::None || MaxSteps == 0) + break; + --MaxSteps; + ++Iter; + ++NumInstrsSteppedOver; + }; + + return NumInstrsSteppedOver != 0 && + (NextPred == ARMVCC::None || Iter == EndIter); +} + +// Returns true if at least one instruction in the range [Iter, End) defines +// or kills VPR. +static bool IsVPRDefinedOrKilledByBlock(MachineBasicBlock::iterator Iter, + MachineBasicBlock::iterator End) { + for (; Iter != End; ++Iter) + if (Iter->definesRegister(ARM::VPR) || Iter->killsRegister(ARM::VPR)) + return true; + return false; +} + +// Given an iterator (Iter) that points at an instruction with a "Then" +// predicate, tries to create the largest block of continuous predicated +// instructions possible, and returns the VPT Block Mask of that block. +// +// This will try to perform some minor optimization in order to maximize the +// size of the block. +static ARM::PredBlockMask +CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, + MachineBasicBlock::instr_iterator EndIter, + SmallVectorImpl &DeadInstructions) { + MachineBasicBlock::instr_iterator BlockBeg = Iter; + (void)BlockBeg; + assert(getVPTInstrPredicate(*Iter) == ARMVCC::Then && + "Expected a Predicated Instruction"); + + LLVM_DEBUG(dbgs() << "VPT block created for: "; Iter->dump()); + + unsigned BlockSize; + StepOverPredicatedInstrs(Iter, EndIter, 4, BlockSize); + + LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter = + std::next(BlockBeg); + AddedInstIter != Iter; ++AddedInstIter) { + dbgs() << " adding: "; + AddedInstIter->dump(); + }); + + // Generate the initial BlockMask + ARM::PredBlockMask BlockMask = getARMVPTBlockMask(BlockSize); + + // Remove VPNOTs while there's still room in the block, so we can make the + // largest block possible. + ARMVCC::VPTCodes CurrentPredicate = ARMVCC::Then; + while (BlockSize < 4 && Iter != EndIter && + Iter->getOpcode() == ARM::MVE_VPNOT) { + + // Try to skip all of the predicated instructions after the VPNOT, stopping + // after (4 - BlockSize). If we can't skip them all, stop. + unsigned ElseInstCnt = 0; + MachineBasicBlock::instr_iterator VPNOTBlockEndIter = std::next(Iter); + if (!StepOverPredicatedInstrs(VPNOTBlockEndIter, EndIter, (4 - BlockSize), + ElseInstCnt)) + break; + + // Check if this VPNOT can be removed or not: It can only be removed if at + // least one of the predicated instruction that follows it kills or sets + // VPR. + if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter)) + break; + + LLVM_DEBUG(dbgs() << " removing VPNOT: "; Iter->dump();); + + // Record the new size of the block + BlockSize += ElseInstCnt; + assert(BlockSize <= 4 && "Block is too large!"); + + // Record the VPNot to remove it later. + DeadInstructions.push_back(&*Iter); + ++Iter; + + // Replace "then" by "elses" in the block until we find an instruction that + // defines VPR, then after that leave everything to "t". + // Note that we are using "Iter" to iterate over the block so we can update + // it at the same time. 
+    bool ChangeToElse = (CurrentPredicate == ARMVCC::Then);
+    for (; Iter != VPNOTBlockEndIter; ++Iter) {
+      // Find the operand that holds the predicate.
+      int OpIdx = findFirstVPTPredOperandIdx(*Iter);
+      assert(OpIdx != -1);
+
+      // Update the mask + change the predicate to an else if needed.
+      if (ChangeToElse) {
+        // Change the predicate and update the mask.
+        Iter->getOperand(OpIdx).setImm(ARMVCC::Else);
+        BlockMask = ExpandBlockMask(BlockMask, ARMVCC::Else);
+        // Reset back to a "then" predicate if this instruction defines VPR.
+        if (Iter->definesRegister(ARM::VPR))
+          ChangeToElse = false;
+      } else
+        BlockMask = ExpandBlockMask(BlockMask, ARMVCC::Then);
+
+      LLVM_DEBUG(dbgs() << "  adding: "; Iter->dump());
+    }
+
+    CurrentPredicate =
+        (CurrentPredicate == ARMVCC::Then ? ARMVCC::Else : ARMVCC::Then);
+  }
+  return BlockMask;
+}
+
 bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
   bool Modified = false;
   MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
   MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
 
+  SmallVector<MachineInstr *, 4> DeadInstructions;
+
   while (MBIter != EndIter) {
     MachineInstr *MI = &*MBIter;
     unsigned PredReg = 0;
-    DebugLoc dl = MI->getDebugLoc();
+    DebugLoc DL = MI->getDebugLoc();
     ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
 
     // The idea of the predicate is that None, Then and Else are for use when
     // handling assembly language: they correspond to the three possible
     // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
-    // from assembly source or disassembled from object code, you expect to see
-    // a mixture whenever there's a long VPT block. But in code generation, we
-    // hope we'll never generate an Else as input to this pass.
+    // from assembly source or disassembled from object code, you expect to
+    // see a mixture whenever there's a long VPT block. But in code
+    // generation, we hope we'll never generate an Else as input to this pass.
     assert(Pred != ARMVCC::Else &&
            "VPT block pass does not expect Else preds");
 
     if (Pred == ARMVCC::None) {
@@ -119,42 +279,25 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
       continue;
     }
 
-    LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
-    int VPTInstCnt = 1;
-    ARMVCC::VPTCodes NextPred;
-
-    // Look at subsequent instructions, checking if they can be in the same VPT
-    // block.
- ++MBIter; - while (MBIter != EndIter && VPTInstCnt < 4) { - NextPred = getVPTInstrPredicate(*MBIter, PredReg); - assert(NextPred != ARMVCC::Else && - "VPT block pass does not expect Else preds"); - if (NextPred != Pred) - break; - LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump()); - ++VPTInstCnt; - ++MBIter; - }; - - unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt); + ARM::PredBlockMask BlockMask = + CreateVPTBlock(MBIter, EndIter, DeadInstructions); - // Search back for a VCMP that can be folded to create a VPT, or else create - // a VPST directly + // Search back for a VCMP that can be folded to create a VPT, or else + // create a VPST directly MachineInstrBuilder MIBuilder; unsigned NewOpcode; - MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode); - if (VCMP) { + LLVM_DEBUG(dbgs() << " final block mask: " << (unsigned)BlockMask << "\n"); + if (MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode)) { LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); - MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); - MIBuilder.addImm(BlockMask); + MIBuilder = BuildMI(Block, MI, DL, TII->get(NewOpcode)); + MIBuilder.addImm((uint64_t)BlockMask); MIBuilder.add(VCMP->getOperand(1)); MIBuilder.add(VCMP->getOperand(2)); MIBuilder.add(VCMP->getOperand(3)); VCMP->eraseFromParent(); } else { - MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); - MIBuilder.addImm(BlockMask); + MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST)); + MIBuilder.addImm((uint64_t)BlockMask); } finalizeBundle( @@ -162,6 +305,15 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { Modified = true; } + + // Erase all dead instructions + for (MachineInstr *DeadMI : DeadInstructions) { + if (DeadMI->isInsideBundle()) + DeadMI->eraseFromBundle(); + else + DeadMI->eraseFromParent(); + } + return Modified; } diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3e8e77a9db1f..583a09163f4e 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -74,6 +74,10 @@ ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg); int findFirstVPTPredOperandIdx(const MachineInstr &MI); ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI, unsigned &PredReg); +inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { + unsigned PredReg; + return getVPTInstrPredicate(MI, PredReg); } +} // namespace llvm #endif diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index a7578ee172ca..b2e434fbd78a 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -91,25 +91,35 @@ namespace ARMVCC { Then, Else }; - - enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 +} // namespace ARMVCC + +namespace ARM { + /// Mask values for IT and VPT Blocks, to be used by MCOperands. + /// Note that this is different from the "real" encoding used by the + /// instructions. In this encoding, the lowest set bit indicates the end of + /// the encoding, and above that, "1" indicates an else, while "0" indicates + /// a then. 
+ /// Tx = x100 + /// Txy = xy10 + /// Txyz = xyz1 + enum class PredBlockMask { + T = 0b1000, + TT = 0b0100, + TE = 0b1100, + TTT = 0b0010, + TTE = 0b0110, + TEE = 0b1110, + TET = 0b1010, + TTTT = 0b0001, + TTTE = 0b0011, + TTEE = 0b0111, + TTET = 0b0101, + TEEE = 0b1111, + TEET = 0b1101, + TETT = 0b1001, + TETE = 0b1011 }; -} +} // namespace ARM inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) { switch (CC) { diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index ebd060ce503e..1e4030b84bc1 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -330,7 +330,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, case PS_fi: { int FI = op(1).getIndex(); int Off = op(2).getImm(); - unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off); + unsigned A = MFI.getObjectAlign(FI).value() + std::abs(Off); unsigned L = countTrailingZeros(A); RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); RC.fill(0, L, BT::BitValue::Zero); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 9c39d0bf844f..a9cfbdc3c6fc 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -645,15 +645,15 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, auto &HMFI = *MF.getInfo(); for (int i = HMFI.getFirstNamedArgFrameIndex(), e = HMFI.getLastNamedArgFrameIndex(); i >= e; --i) { - int ObjSize = MFI.getObjectSize(i); - int ObjAlign = MFI.getObjectAlignment(i); + uint64_t ObjSize = MFI.getObjectSize(i); + Align ObjAlign = MFI.getObjectAlign(i); // Determine the kind of load/store that should be used. unsigned LDOpc, STOpc; - int OpcodeChecker = ObjAlign; + uint64_t OpcodeChecker = ObjAlign.value(); // Handle cases where alignment of an object is > its size. - if (ObjSize < ObjAlign) { + if (ObjAlign > ObjSize) { if (ObjSize <= 1) OpcodeChecker = 1; else if (ObjSize <= 2) @@ -702,17 +702,17 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, while (Count < LoadStoreCount) { // Load the value of the named argument on stack. BuildMI(MBB, InsertPt, dl, HII.get(LDOpc), RegUsed) - .addReg(SP) - .addImm(RegisterSavedAreaSizePlusPadding + - ObjAlign * Count + NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(SP) + .addImm(RegisterSavedAreaSizePlusPadding + + ObjAlign.value() * Count + NumBytes) + .setMIFlag(MachineInstr::FrameSetup); // Store it below the register saved area plus padding. BuildMI(MBB, InsertPt, dl, HII.get(STOpc)) - .addReg(SP) - .addImm(ObjAlign * Count + NumBytes) - .addReg(RegUsed) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(SP) + .addImm(ObjAlign.value() * Count + NumBytes) + .addReg(RegUsed) + .setMIFlag(MachineInstr::FrameSetup); Count++; } @@ -1520,8 +1520,8 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( unsigned S = MFI.getObjectSize(i); // Reduce the alignment to at most 8. This will require unaligned vector // stores if they happen here. 
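// A hedged note on the unsigned -> Align migration applied throughout this
// patch: llvm::Align wraps a power-of-two byte count, so comparisons order by
// alignment strength and helpers replace hand-rolled arithmetic, e.g.
// (assuming the current llvm/Support/Alignment.h API):
//   Align(8).value() == 8
//   std::max(Align(4), Align(8)) == Align(8)
//   commonAlignment(Align(16), /*Offset=*/4) == Align(4)  // gcd-style
//   isAligned(Align(8), 24) == true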
- unsigned A = std::max(MFI.getObjectAlignment(i), 8U); - MFI.setObjectAlignment(i, 8); + Align A = std::max(MFI.getObjectAlign(i), Align(8)); + MFI.setObjectAlignment(i, Align(8)); LFS = alignTo(LFS+S, A); MFI.mapLocalFrameObject(i, -static_cast(LFS)); DealignSlots.insert(i); @@ -1934,11 +1934,11 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, bool NeedsAligna = needsAligna(MF); unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass); - unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); - unsigned HasAlign = MFI.getObjectAlignment(FI); + Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass); + Align HasAlign = MFI.getObjectAlign(FI); unsigned StoreOpc; - auto UseAligned = [&] (unsigned NeedAlign, unsigned HasAlign) { + auto UseAligned = [&](Align NeedAlign, Align HasAlign) { return !NeedsAligna && (NeedAlign <= HasAlign); }; @@ -1986,11 +1986,11 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, bool NeedsAligna = needsAligna(MF); unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass); - unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); - unsigned HasAlign = MFI.getObjectAlignment(FI); + Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass); + Align HasAlign = MFI.getObjectAlign(FI); unsigned LoadOpc; - auto UseAligned = [&] (unsigned NeedAlign, unsigned HasAlign) { + auto UseAligned = [&](Align NeedAlign, Align HasAlign) { return !NeedsAligna && (NeedAlign <= HasAlign); }; @@ -2030,8 +2030,8 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); - unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); - unsigned HasAlign = MFI.getObjectAlignment(FI); + Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass); + Align HasAlign = MFI.getObjectAlign(FI); bool UseAligned = !NeedsAligna && (NeedAlign <= HasAlign); unsigned StoreOpc = UseAligned ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32Ub_ai; @@ -2060,8 +2060,8 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); - unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); - unsigned HasAlign = MFI.getObjectAlignment(FI); + Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass); + Align HasAlign = MFI.getObjectAlign(FI); bool UseAligned = !NeedsAligna && (NeedAlign <= HasAlign); unsigned LoadOpc = UseAligned ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32Ub_ai; diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index 1d1744a49b3a..08967a534bf9 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -266,7 +266,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, << "spOffset : " << spOffset << "\n" << "stackSize : " << stackSize << "\n" << "alignment : " - << MF.getFrameInfo().getObjectAlignment(FrameIndex) + << MF.getFrameInfo().getObjectAlign(FrameIndex).value() << "\n"); eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 83074ca22768..2ef583c0a799 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -218,11 +218,10 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, // covered by the vector op. Otherwise, it returns 1. 
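// For example (a hedged reading of the contract above): four i32 pieces at
// Offsets {0, 4, 8, 12} with ParamAlignment == Align(16) allow a single
// 16-byte access at Idx == 0, so the function would return 4; with
// ParamAlignment == Align(4) the alignment check fails and it returns 1.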
static unsigned CanMergeParamLoadStoresStartingAt( unsigned Idx, uint32_t AccessSize, const SmallVectorImpl &ValueVTs, - const SmallVectorImpl &Offsets, unsigned ParamAlignment) { - assert(isPowerOf2_32(AccessSize) && "must be a power of 2!"); + const SmallVectorImpl &Offsets, Align ParamAlignment) { // Can't vectorize if param alignment is not sufficient. - if (AccessSize > ParamAlignment) + if (ParamAlignment < AccessSize) return 1; // Can't vectorize if offset is not aligned. if (Offsets[Idx] & (AccessSize - 1)) @@ -282,7 +281,7 @@ enum ParamVectorizationFlags { static SmallVector VectorizePTXValueVTs(const SmallVectorImpl &ValueVTs, const SmallVectorImpl &Offsets, - unsigned ParamAlignment) { + Align ParamAlignment) { // Set vector size to match ValueVTs and mark all elements as // scalars by default. SmallVector VectorInfo; @@ -1243,7 +1242,7 @@ NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { std::string NVPTXTargetLowering::getPrototype( const DataLayout &DL, Type *retTy, const ArgListTy &Args, - const SmallVectorImpl &Outs, unsigned retAlignment, + const SmallVectorImpl &Outs, MaybeAlign retAlignment, ImmutableCallSite CS) const { auto PtrVT = getPointerTy(DL); @@ -1279,8 +1278,8 @@ std::string NVPTXTargetLowering::getPrototype( O << ".param .b" << PtrVT.getSizeInBits() << " _"; } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { - O << ".param .align " << retAlignment << " .b8 _[" - << DL.getTypeAllocSize(retTy) << "]"; + O << ".param .align " << (retAlignment ? retAlignment->value() : 0) + << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; } else { llvm_unreachable("Unknown return type"); } @@ -1353,16 +1352,16 @@ std::string NVPTXTargetLowering::getPrototype( return O.str(); } -unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, - ImmutableCallSite CS, - Type *Ty, unsigned Idx, - const DataLayout &DL) const { +Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, + ImmutableCallSite CS, Type *Ty, + unsigned Idx, + const DataLayout &DL) const { if (!CS) { // CallSite is zero, fallback to ABI type alignment - return DL.getABITypeAlignment(Ty); + return DL.getABITypeAlign(Ty); } - unsigned Align = 0; + unsigned Alignment = 0; const Value *DirectCallee = CS.getCalledFunction(); if (!DirectCallee) { @@ -1374,8 +1373,8 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // With bitcast'd call targets, the instruction will be the call if (isa(CalleeI)) { // Check if we have call alignment metadata - if (getAlign(*cast(CalleeI), Idx, Align)) - return Align; + if (getAlign(*cast(CalleeI), Idx, Alignment)) + return Align(Alignment); const Value *CalleeV = cast(CalleeI)->getCalledValue(); // Ignore any bitcast instructions @@ -1397,12 +1396,12 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function if (DirectCallee) - if (getAlign(*cast(DirectCallee), Idx, Align)) - return Align; + if (getAlign(*cast(DirectCallee), Idx, Alignment)) + return Align(Alignment); // Call is indirect or alignment information is not available, fall back to // the ABI type alignment - return DL.getABITypeAlignment(Ty); + return DL.getABITypeAlign(Ty); } SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, @@ -1450,15 +1449,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector VTs; SmallVector Offsets; ComputePTXValueVTs(*this, DL, 
Ty, VTs, &Offsets); - unsigned ArgAlign = - getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + Align ArgAlign = getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); unsigned AllocSize = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); bool NeedAlign; // Does argument declaration specify alignment? if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // declare .param .align .b8 .param[]; SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), + Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), DAG.getConstant(paramCount, dl, MVT::i32), DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, @@ -1539,8 +1537,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust type of the store op if we've extended the scalar // return value. EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; - unsigned EltAlign = - NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; + MaybeAlign EltAlign; + if (NeedAlign) + EltAlign = commonAlignment(ArgAlign, Offsets[j]); Chain = DAG.getMemIntrinsicNode( Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, @@ -1604,10 +1603,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(paramCount, dl, MVT::i32), DAG.getConstant(curOffset, dl, MVT::i32), theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, elemtype, - MachinePointerInfo(), /* Align */ 0, - MachineMemOperand::MOStore); + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, + MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); InFlag = Chain.getValue(1); } @@ -1615,7 +1613,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } GlobalAddressSDNode *Func = dyn_cast(Callee.getNode()); - unsigned retAlignment = 0; + MaybeAlign retAlignment = None; // Handle Result if (Ins.size() > 0) { @@ -1644,11 +1642,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); } else { retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + assert(retAlignment && "retAlignment is guaranteed to be set"); SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, - DAG.getConstant(retAlignment, dl, MVT::i32), - DAG.getConstant(resultsz / 8, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag }; + SDValue DeclareRetOps[] = { + Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), + DAG.getConstant(resultsz / 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, DeclareRetOps); InFlag = Chain.getValue(1); @@ -1754,7 +1753,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); - unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + Align RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); SmallVector LoadVTs; @@ -1770,7 +1769,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool needTruncate = false; EVT TheLoadType = VTs[i]; EVT EltType = Ins[i].VT; - unsigned EltAlign = 
GreatestCommonDivisor64(RetAlign, Offsets[i]); + Align EltAlign = commonAlignment(RetAlign, Offsets[i]); if (ExtendIntegerRetVal) { TheLoadType = MVT::i32; EltType = MVT::i32; @@ -2545,7 +2544,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); assert(VTs.size() > 0 && "Unexpected empty type."); auto VectorInfo = - VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); + VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); SDValue Arg = getParamSymbol(DAG, idx, PtrVT); int VecIdx = -1; // Index of the first element of the current vector. @@ -2664,7 +2663,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); + VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -2716,10 +2715,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Adjust type of load/store op if we've extended the scalar // return value. EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; - Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), - StoreOperands, TheStoreType, - MachinePointerInfo(), /* Align */ 1, - MachineMemOperand::MOStore); + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); // Cleanup vector state. StoreOperands.clear(); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 546fe49808e2..986ad70ed80c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -491,8 +491,7 @@ class NVPTXTargetLowering : public TargetLowering { std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl &, - unsigned retAlignment, - ImmutableCallSite CS) const; + MaybeAlign retAlignment, ImmutableCallSite CS) const; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, @@ -579,8 +578,8 @@ class NVPTXTargetLowering : public TargetLowering { SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - unsigned getArgumentAlignment(SDValue Callee, ImmutableCallSite CS, Type *Ty, - unsigned Idx, const DataLayout &DL) const; + Align getArgumentAlignment(SDValue Callee, ImmutableCallSite CS, Type *Ty, + unsigned Idx, const DataLayout &DL) const; }; } // namespace llvm diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 49c2790d7caa..d6e0bc285b3a 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -60,6 +60,13 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() { createPPCLEDisassembler); } +static DecodeStatus decodeCondBrTarget(MCInst &Inst, unsigned Imm, + uint64_t /*Address*/, + const void * /*Decoder*/) { + Inst.addOperand(MCOperand::createImm(SignExtend32<14>(Imm))); + return MCDisassembler::Success; +} + static DecodeStatus DecodePCRel24BranchTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void 
*Decoder) { diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 523bbe7dd367..a233dc9b3474 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4213,7 +4213,7 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { // because it is translated to r31 or r1 + slot + offset. We won't know the // slot number until the stack frame is finalized. const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo(); - unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex()); + unsigned SlotAlign = MFI.getObjectAlign(FI->getIndex()).value(); if ((SlotAlign % Val) != 0) return false; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 6f596f8ef867..3223efbdc661 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2417,8 +2417,7 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FrameIdx); - if (Align >= 4) + if (MFI.getObjectAlign(FrameIdx) >= Align(4)) return; PPCFunctionInfo *FuncInfo = MF.getInfo(); @@ -2750,7 +2749,7 @@ SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), None, MachineMemOperand::MOLoad); } @@ -13768,7 +13767,7 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, (Op1VT == MVT::i32 || Op1VT == MVT::i64 || (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8))); - if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() || + if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() || cast(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt) return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6c48512d8fb6..3102b9089817 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -764,7 +764,9 @@ def PPCCondBrAsmOperand : AsmOperandClass { def condbrtarget : Operand { let PrintMethod = "printBranchOperand"; let EncoderMethod = "getCondBrEncoding"; + let DecoderMethod = "decodeCondBrTarget"; let ParserMatchClass = PPCCondBrAsmOperand; + let OperandType = "OPERAND_PCREL"; } def abscondbrtarget : Operand { let PrintMethod = "printAbsBranchOperand"; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index f66d06c20e37..66f943c634fe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -10,55 +10,18 @@ // //===----------------------------------------------------------------------===// +#include "RISCVISelDAGToDAG.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" -#include "RISCV.h" -#include "RISCVTargetMachine.h" #include "Utils/RISCVMatInt.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; #define DEBUG_TYPE "riscv-isel" -// RISCV-specific code to select RISCV machine instructions for -// SelectionDAG operations. 
-namespace { -class RISCVDAGToDAGISel final : public SelectionDAGISel { - const RISCVSubtarget *Subtarget = nullptr; - -public: - explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine) - : SelectionDAGISel(TargetMachine) {} - - StringRef getPassName() const override { - return "RISCV DAG->DAG Pattern Instruction Selection"; - } - - bool runOnMachineFunction(MachineFunction &MF) override { - Subtarget = &MF.getSubtarget(); - return SelectionDAGISel::runOnMachineFunction(MF); - } - - void PostprocessISelDAG() override; - - void Select(SDNode *Node) override; - - bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, - std::vector &OutOps) override; - - bool SelectAddrFI(SDValue Addr, SDValue &Base); - -// Include the pieces autogenerated from the target description. -#include "RISCVGenDAGISel.inc" - -private: - void doPeepholeLoadStoreADDI(); -}; -} - void RISCVDAGToDAGISel::PostprocessISelDAG() { doPeepholeLoadStoreADDI(); } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h new file mode 100644 index 000000000000..dcf733ec3675 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -0,0 +1,56 @@ +//===---- RISCVISelDAGToDAG.h - A dag to dag inst selector for RISCV ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the RISCV target. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H +#define LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H + +#include "RISCV.h" +#include "RISCVTargetMachine.h" +#include "llvm/CodeGen/SelectionDAGISel.h" + +// RISCV-specific code to select RISCV machine instructions for +// SelectionDAG operations. +namespace llvm { +class RISCVDAGToDAGISel : public SelectionDAGISel { + const RISCVSubtarget *Subtarget = nullptr; + +public: + explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine) + : SelectionDAGISel(TargetMachine) {} + + StringRef getPassName() const override { + return "RISCV DAG->DAG Pattern Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + Subtarget = &MF.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + + void PostprocessISelDAG() override; + + void Select(SDNode *Node) override; + + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector &OutOps) override; + + bool SelectAddrFI(SDValue Addr, SDValue &Base); + +// Include the pieces autogenerated from the target description. 
+#include "RISCVGenDAGISel.inc" + +private: + void doPeepholeLoadStoreADDI(); +}; +} + +#endif diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 929169dd62d9..f76abf22e4db 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -129,6 +129,10 @@ class RISCVTargetLowering : public TargetLowering { return ISD::SIGN_EXTEND; } + ISD::NodeType getExtendForAtomicCmpSwapArg() const override { + return ISD::SIGN_EXTEND; + } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { if (DAG.getMachineFunction().getFunction().hasMinSize()) return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index b4354e852194..b186e32e788d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -79,7 +79,6 @@ template <> bool ConcreteRegion::isLoop() const { return true; } class RegionInfo { const MachineLoopInfo &MLI; const WebAssemblyExceptionInfo &WEI; - std::vector Regions; DenseMap> LoopMap; DenseMap> ExceptionMap; @@ -93,7 +92,14 @@ class RegionInfo { const auto *WE = WEI.getExceptionFor(MBB); if (!ML && !WE) return nullptr; - if ((ML && !WE) || (ML && WE && ML->getNumBlocks() < WE->getNumBlocks())) { + // We determine subregion relationship by domination of their headers, i.e., + // if region A's header dominates region B's header, B is a subregion of A. + // WebAssemblyException contains BBs in all its subregions (loops or + // exceptions), but MachineLoop may not, because MachineLoop does not contain + // BBs that don't have a path to its header even if they are dominated by + // its header. So here we should use WE->contains(ML->getHeader()), but not + // ML->contains(WE->getHeader()). + if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) { // If the smallest region containing MBB is a loop if (LoopMap.count(ML)) return LoopMap[ML].get(); @@ -368,6 +374,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, const Region *Region = RI.getRegionFor(&MBB); if (Region && &MBB == Region->getHeader()) { + // Region header. if (Region->isLoop()) { // Loop header. The loop predecessor should be sorted above, and the // other predecessors should be backedges below. @@ -377,7 +384,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, "Loop header predecessors must be loop predecessors or " "backedges"); } else { - // Not a loop header. All predecessors should be sorted above. + // Exception header. All predecessors should be sorted above. for (auto Pred : MBB.predecessors()) assert(Pred->getNumber() < MBB.getNumber() && "Non-loop-header predecessors should be topologically sorted"); @@ -386,7 +393,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, "Regions should be declared at most once."); } else { - // Not a loop header. All predecessors should be sorted above. + // Not a region header. All predecessors should be sorted above. 
for (auto Pred : MBB.predecessors()) assert(Pred->getNumber() < MBB.getNumber() && "Non-loop-header predecessors should be topologically sorted"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 89265a3a9520..c8878a48b243 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -344,6 +344,21 @@ static std::string getSignature(FunctionType *FTy) { return Sig; } +static void markAsImported(Function *F) { + // Tell the linker that this function is expected to be imported from the + // 'env' module. + if (!F->hasFnAttribute("wasm-import-module")) { + llvm::AttrBuilder B; + B.addAttribute("wasm-import-module", "env"); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + } + if (!F->hasFnAttribute("wasm-import-name")) { + llvm::AttrBuilder B; + B.addAttribute("wasm-import-name", F->getName()); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + } +} + // Returns __cxa_find_matching_catch_N function, where N = NumClauses + 2. // This is because a landingpad instruction contains two more arguments, a // personality function and a cleanup bit, and __cxa_find_matching_catch_N @@ -360,6 +375,7 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M, Function *F = Function::Create( FTy, GlobalValue::ExternalLinkage, "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M); + markAsImported(F); FindMatchingCatches[NumClauses] = F; return F; } @@ -469,6 +485,7 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) { CalleeFTy->isVarArg()); Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "__invoke_" + Sig, M); + markAsImported(F); InvokeWrappers[Sig] = F; return F; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 9a832c1bb16d..c2eb78bd056d 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -136,9 +136,11 @@ class X86AsmBackend : public MCAsmBackend { bool needAlign(MCObjectStreamer &OS) const; bool needAlignInst(const MCInst &Inst) const; + bool allowAutoPaddingForInst(const MCInst &Inst, MCObjectStreamer &OS) const; MCInst PrevInst; MCBoundaryAlignFragment *PendingBoundaryAlign = nullptr; std::pair PrevInstPosition; + bool AllowAutoPaddingForInst; public: X86AsmBackend(const Target &T, const MCSubtargetInfo &STI) @@ -538,13 +540,8 @@ static size_t getSizeForInstFragment(const MCFragment *F) { } } -/// Check if the instruction operand needs to be aligned. Padding is disabled -/// before intruction which may be rewritten by linker(e.g. TLSCALL). +/// Check if the instruction operand needs to be aligned. bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { - // Linker may rewrite the instruction with variant symbol operand. - if (hasVariantSymbol(Inst)) - return false; - const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode()); return (InstDesc.isConditionalBranch() && (AlignBranchType & X86::AlignBranchJcc)) || @@ -558,31 +555,53 @@ bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { (AlignBranchType & X86::AlignBranchIndirect)); } -/// Insert BoundaryAlignFragment before instructions to align branches. 
-void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
-                                         const MCInst &Inst) {
-  if (!needAlign(OS))
-    return;
+/// Return true if we can insert a NOP or prefixes automatically before the
+/// instruction to be emitted.
+bool X86AsmBackend::allowAutoPaddingForInst(const MCInst &Inst,
+                                            MCObjectStreamer &OS) const {
+  if (hasVariantSymbol(Inst))
+    // Linker may rewrite the instruction with variant symbol operand (e.g.
+    // TLSCALL).
+    return false;
 
   if (hasInterruptDelaySlot(PrevInst))
     // If this instruction follows an interrupt enabling instruction with a one
     // instruction delay, inserting a nop would change behavior.
-    return;
+    return false;
 
   if (isPrefix(PrevInst, *MCII))
-    // If this instruction follows a prefix, inserting a nop would change
+    // If this instruction follows a prefix, inserting a nop/prefix would change
     // semantics.
-    return;
+    return false;
+
+  if (isPrefix(Inst, *MCII))
+    // If this instruction is a prefix, inserting a prefix would change
+    // semantics.
+    return false;
 
   if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
     // If this instruction follows any data, there is no clear
-    // instruction boundary, inserting a nop would change semantics.
+    // instruction boundary, inserting a nop/prefix would change semantics.
+    return false;
+
+  return true;
+}
+
+/// Insert BoundaryAlignFragment before instructions to align branches.
+void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
+                                         const MCInst &Inst) {
+  AllowAutoPaddingForInst = allowAutoPaddingForInst(Inst, OS);
+
+  if (!needAlign(OS))
     return;
 
   if (!isMacroFused(PrevInst, Inst))
     // Macro fusion doesn't actually happen; clear the pending fragment.
     PendingBoundaryAlign = nullptr;
 
+  if (!AllowAutoPaddingForInst)
+    return;
+
   if (PendingBoundaryAlign &&
       OS.getCurrentFragment()->getPrevNode() == PendingBoundaryAlign) {
     // Macro fusion actually happens and there is no other fragment inserted
@@ -617,12 +636,14 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
 /// Set the last fragment to be aligned for the BoundaryAlignFragment.
 void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
                                        const MCInst &Inst) {
-  if (!needAlign(OS))
-    return;
-
   PrevInst = Inst;
   MCFragment *CF = OS.getCurrentFragment();
   PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+  if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
+    F->setAllowAutoPadding(AllowAutoPaddingForInst);
+
+  if (!needAlign(OS))
+    return;
 
   if (!needAlignInst(Inst) || !PendingBoundaryAlign)
     return;
@@ -827,12 +848,6 @@ static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
   return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
 }
 
-
-static bool shouldAddPrefix(const MCInst &Inst, const MCInstrInfo &MCII) {
-  // Linker may rewrite the instruction with variant symbol operand.
-  return !hasVariantSymbol(Inst);
-}
-
 static unsigned getRemainingPrefixSize(const MCInst &Inst,
                                        const MCSubtargetInfo &STI,
                                        MCCodeEmitter &Emitter) {
@@ -856,7 +871,7 @@ static unsigned getRemainingPrefixSize(const MCInst &Inst,
 bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
                                             MCCodeEmitter &Emitter,
                                             unsigned &RemainingSize) const {
-  if (!shouldAddPrefix(RF.getInst(), *MCII))
+  if (!RF.getAllowAutoPadding())
     return false;
   // If the instruction isn't fully relaxed, shifting it around might require a
   // larger value for one of the fixups than can be encoded.
The outer loop diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index f394fa403a1c..0e8e94e311d4 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2012,7 +2012,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); + assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return Offset + StackSize; } } else if (TRI->needsStackRealignment(MF)) { @@ -2020,7 +2020,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); + assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return Offset + StackSize; } // FIXME: Support tail calls @@ -3203,7 +3203,7 @@ struct X86FrameSortingObject { bool IsValid = false; // true if we care about this Object. unsigned ObjectIndex = 0; // Index of Object into MFI list. unsigned ObjectSize = 0; // Size of Object in bytes. - unsigned ObjectAlignment = 1; // Alignment of Object in bytes. + Align ObjectAlignment = Align(1); // Alignment of Object in bytes. unsigned ObjectNumUses = 0; // Object static number of uses. }; @@ -3288,7 +3288,7 @@ void X86FrameLowering::orderFrameObjects( for (auto &Obj : ObjectsToAllocate) { SortingObjects[Obj].IsValid = true; SortingObjects[Obj].ObjectIndex = Obj; - SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj); + SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj); // Set the size. int ObjectSize = MFI.getObjectSize(Obj); if (ObjectSize == 0) @@ -3381,7 +3381,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( int FrameIndex = H.CatchObj.FrameIndex; if (FrameIndex != INT_MAX) { // Ensure alignment. 
- unsigned Align = MFI.getObjectAlignment(FrameIndex); + unsigned Align = MFI.getObjectAlign(FrameIndex).value(); MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align; MinFixedObjOffset -= MFI.getObjectSize(FrameIndex); MFI.setObjectOffset(FrameIndex, MinFixedObjOffset); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index e6116d644f97..2f3d6e809037 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1185,7 +1185,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, - MPI, /*Align*/ 0, + MPI, /*Align*/ None, MachineMemOperand::MOStore); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Store->getFlags(); @@ -1201,9 +1201,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (!DstIsSSE) { SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); SDValue Ops[] = {Store, MemTmp}; - Result = - CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, - /*Align*/ 0, MachineMemOperand::MOLoad); + Result = CurDAG->getMemIntrinsicNode( + X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Result->getFlags(); Flags.setNoFPExcept(true); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 45cdfa9450d0..eba4db960f51 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6133,6 +6133,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { return SDValue(); } +void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl &Mask, + bool Lo, bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int Pos = (i % NumEltsInLane) / 2 + LaneStart; + Pos += (Unary ? 0 : NumElts * (i % 2)); + Pos += (Lo ? 0 : NumEltsInLane / 2); + Mask.push_back(Pos); + } +} + +/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation +/// imposed by AVX and specific to the unary pattern. Example: +/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> +/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> +void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, + bool Lo) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + for (int i = 0; i < NumElts; ++i) { + int Pos = i / 2; + Pos += (Lo ? 0 : NumElts / 2); + Mask.push_back(Pos); + } +} + /// Returns a vector_shuffle node for an unpackl operation. 
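/// For instance (an illustrative note, not part of the patch): with
/// VT == v8i16, createUnpackShuffleMask produces
///   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>  (interleave the low halves)
///   Lo, unary:  <0, 0, 1, 1, 2, 2, 3, 3>    (matches the splat2 pattern
///                                            above for single-lane types)
/// which is exactly the unpacklo element order the helper below produces.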
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -7320,8 +7349,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; - scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); - scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); + scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); + scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); @@ -7379,7 +7408,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector ScaledSubMask; - scaleShuffleMask(Scale, SubMask, ScaledSubMask); + scaleShuffleMask(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; @@ -7522,7 +7551,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case ISD::TRUNCATE: case X86ISD::VTRUNC: { SDValue Src = N.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); + EVT SrcVT = Src.getValueType(); + // Truncated source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt; @@ -8100,11 +8133,11 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. - unsigned RequiredAlign = VT.getSizeInBits()/8; + Align RequiredAlign(VT.getSizeInBits() / 8); SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + if (DAG.InferPtrAlign(Ptr) < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. @@ -8119,9 +8152,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // Ptr + (Offset & ~15). 
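// Worked numbers for the rounding below (a hedged illustration, taking
// RequiredAlign == Align(16)): Offset 20 passes the check since
// (20 % 16) & 3 == 0, and StartOffset becomes 20 & ~15 == 16, so the load is
// rebased to Ptr + 16 with 4 bytes of intra-chunk offset; Offset 22 fails
// ((22 % 16) & 3 == 2) and we give up on this transform.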
if (Offset < 0) return SDValue(); - if ((Offset % RequiredAlign) & 3) + if ((Offset % RequiredAlign.value()) & 3) return SDValue(); - int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -8390,11 +8423,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, - LDBase->getPointerInfo(), - LDBase->getAlignment(), - MachineMemOperand::MOLoad); + SDValue ResNode = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), + LDBase->getAlign(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); @@ -8636,7 +8667,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - unsigned Alignment = cast(CP)->getAlignment(); + MaybeAlign Alignment(cast(CP)->getAlignment()); SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; @@ -8724,7 +8755,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); - unsigned Alignment = cast(CP)->getAlignment(); + MaybeAlign Alignment(cast(CP)->getAlignment()); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; @@ -11297,20 +11328,21 @@ static int canLowerByDroppingEvenElements(ArrayRef Mask, // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. +// TODO: Add support for matching multiple PACKSS/PACKUS stages. 
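// A sketch of what the matcher below accepts (illustrative, not exhaustive):
// truncating v8i16 -> v16i8 via PACKUSWB requires the top 8 bits of every
// i16 lane to be known zero (the MaskedValueIsZero path), while PACKSSWB
// instead requires more than 8 sign bits per lane (the ComputeNumSignBits
// path), i.e. each value must already be an in-range signed i8.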
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); - MVT PackSVT = MVT::getIntegerVT(BitSize * 2); - MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); - auto MatchPACK = [&](SDValue N1, SDValue N2) { + auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { + unsigned NumSrcBits = PackVT.getScalarSizeInBits(); + unsigned NumPackedBits = NumSrcBits - BitSize; SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); - if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { - APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); + if (Subtarget.hasSSE41() || BitSize == 8) { + APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; @@ -11320,8 +11352,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return true; } } - if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && - (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { + if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) && + (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; @@ -11331,18 +11363,21 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return false; }; + MVT PackSVT = MVT::getIntegerVT(BitSize * 2); + MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); + // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) - if (MatchPACK(V1, V2)) + if (MatchPACK(V1, V2, PackVT)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) - if (MatchPACK(V1, V1)) + if (MatchPACK(V1, V1, PackVT)) return true; return false; @@ -16279,7 +16314,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector PSHUFDMask; - scaleShuffleMask(2, RepeatedMask, PSHUFDMask); + scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, @@ -16928,7 +16963,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, SmallVector Widened256Mask; if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) { Widened128Mask.clear(); - llvm::scaleShuffleMask(2, Widened256Mask, Widened128Mask); + llvm::scaleShuffleMask(2, Widened256Mask, Widened128Mask); } // Try to lower to vshuf64x2/vshuf32x4. @@ -17079,7 +17114,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, SmallVector Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector PSHUFDMask; - scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); + scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, @@ -17216,6 +17251,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. 
+ if (SDValue V = + lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -17237,13 +17277,13 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputShuffle( - DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1, + RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, @@ -19214,15 +19254,16 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getStoreSize(); + Align Alignment(Size); MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); - int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); + int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Size); + Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment); std::pair Tmp = - BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Size, DAG); + BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -19232,7 +19273,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, std::pair X86TargetLowering::BuildFILD( EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, - MachinePointerInfo PtrInfo, unsigned Alignment, SelectionDAG &DAG) const { + MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const { // Build the FILD SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(DstVT); @@ -19525,8 +19566,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDValue VBias = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*Alignment*/ 8, MachineMemOperand::MOLoad); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8), + MachineMemOperand::MOLoad); SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, DAG.getBitcast(MVT::v4i64, VBias)); @@ -19705,7 +19746,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MPI.getWithOffset(4), 4); std::pair Tmp = - BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, 8, DAG); + BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -19721,7 +19762,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = - DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, 8 /*Align*/); + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8)); // For i64 source, we 
need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -19729,9 +19770,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // in SSE. (The generic code can't know it's OK to do this, or how to.) SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; - SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - MVT::i64, MPI, 8 /*Align*/, - MachineMemOperand::MOLoad); + SDValue Fild = + DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, + Align(8), MachineMemOperand::MOLoad); Chain = Fild.getValue(1); @@ -20161,7 +20202,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. SmallVector Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); - scaleShuffleMask(Scale, ArrayRef({ 0, 2, 1, 3 }), Mask); + scaleShuffleMask(Scale, { 0, 2, 1, 3 }, Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) @@ -20682,14 +20723,13 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, SDValue Ops[] = { Chain, StackPtr }; Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI, - /*Align*/0, MachineMemOperand::MOLoad); + /*Align*/ None, MachineMemOperand::MOLoad); Chain = Src.getValue(1); } SDValue StoreOps[] = { Chain, Src, StackPtr }; - Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, - DAG.getVTList(MVT::Other), StoreOps, - DstVT, MPI, /*Align*/0, + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other), + StoreOps, DstVT, MPI, /*Align*/ None, MachineMemOperand::MOStore); return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); @@ -23589,11 +23629,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( - X86ISD::VAARG_64, dl, - VTs, InstOps, MVT::i64, - MachinePointerInfo(SV), - /*Align=*/0, - MachineMemOperand::MOLoad | MachineMemOperand::MOStore); + X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), + /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it @@ -25775,10 +25812,10 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = {Chain, StackSlot}; Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI, - 2 /*Align*/, MachineMemOperand::MOStore); + Align(2), MachineMemOperand::MOStore); // Load FP Control Word from stack slot - SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, 2 /*Align*/); + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2)); Chain = CWD.getValue(1); // Mask and turn the control bits into a shift for the lookup table. @@ -28453,7 +28490,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, SDValue LdOps[] = {Chain, StackPtr}; SDValue Value = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI, - /*Align*/ 0, MachineMemOperand::MOLoad); + /*Align*/ None, MachineMemOperand::MOLoad); Chain = Value.getValue(1); // Now use an FIST to do the atomic store. 
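      // (A hedged aside on why this is atomic: FILD m64 / FISTP m64 each move
      // the full 64 bits through the x87 stack with a single memory access,
      // so an aligned i64 store can be done atomically on 32-bit x86 without
      // SSE.)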
@@ -29864,10 +29901,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); SDValue StoreOps[] = { Chain, Result, StackPtr }; - Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, - DAG.getVTList(MVT::Other), StoreOps, - MVT::i64, MPI, 0 /*Align*/, - MachineMemOperand::MOStore); + Chain = DAG.getMemIntrinsicNode( + X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, + MPI, None /*Align*/, MachineMemOperand::MOStore); // Finally load the value back from the stack temporary and return it. // This load is not atomic and doesn't need to be. @@ -33607,7 +33643,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) - scaleShuffleMask(2, RepeatedMask, WordMask); + scaleShuffleMask(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); @@ -34060,7 +34096,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; - scaleShuffleMask(MaskScale, BaseMask, Mask); + scaleShuffleMask(MaskScale, BaseMask, Mask); } else { Mask = SmallVector(BaseMask.begin(), BaseMask.end()); } @@ -35287,11 +35323,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(MVT::v2f64, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::f64, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue VZLoad = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::f64, LN->getPointerInfo(), + LN->getAlign(), LN->getMemOperand()->getFlags()); SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad); DCI.CombineTo(N.getNode(), Movddup); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); @@ -35389,11 +35423,10 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue BcastLd = - DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, - MVT::i16, LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, + LN->getPointerInfo(), LN->getAlign(), + LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); DCI.recursivelyDeleteUnusedNodes(LN); @@ -35434,12 +35467,11 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL); SDValue Ops[] = { LN->getChain(), Ptr }; - SDValue BcastLd = - DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, - MVT::i16, - LN->getPointerInfo().getWithOffset(Offset), - MinAlign(LN->getAlignment(), Offset), - LN->getMemOperand()->getFlags()); + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, + LN->getPointerInfo().getWithOffset(Offset), + 
commonAlignment(LN->getAlign(), Offset), + LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); DCI.recursivelyDeleteUnusedNodes(LN); @@ -35472,11 +35504,10 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue BcastLd = - DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, - MVT::f64, LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, + LN->getPointerInfo(), LN->getAlign(), + LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); DCI.recursivelyDeleteUnusedNodes(LN); @@ -36237,12 +36268,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - VT.getVectorElementType(), - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue VZLoad = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, dl, Tys, Ops, VT.getVectorElementType(), + LN->getPointerInfo(), LN->getAlign(), + LN->getMemOperand()->getFlags()); DCI.CombineTo(N, VZLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); DCI.recursivelyDeleteUnusedNodes(LN); @@ -38184,7 +38213,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); - scaleShuffleMask(Scale, Mask, ScaledMask); + scaleShuffleMask(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. @@ -39752,6 +39781,81 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { return SDValue(); } +/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC +/// to avoid the inversion. +static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, + SelectionDAG &DAG) { + // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST. + if (EFLAGS.getOpcode() != X86ISD::PTEST && + EFLAGS.getOpcode() != X86ISD::TESTP) + return SDValue(); + + // PTEST/TESTP sets EFLAGS as: + // TESTZ: ZF = (Op0 & Op1) == 0 + // TESTC: CF = (~Op0 & Op1) == 0 + // TESTNZC: ZF == 0 && CF == 0 + EVT VT = EFLAGS.getValueType(); + SDValue Op0 = EFLAGS.getOperand(0); + SDValue Op1 = EFLAGS.getOperand(1); + EVT OpVT = Op0.getValueType(); + + // TEST*(~X,Y) == TEST*(X,Y) + if (SDValue NotOp0 = IsNOT(Op0, DAG)) { + X86::CondCode InvCC; + switch (CC) { + case X86::COND_B: + // testc -> testz. + InvCC = X86::COND_E; + break; + case X86::COND_AE: + // !testc -> !testz. + InvCC = X86::COND_NE; + break; + case X86::COND_E: + // testz -> testc. + InvCC = X86::COND_B; + break; + case X86::COND_NE: + // !testz -> !testc. + InvCC = X86::COND_AE; + break; + case X86::COND_A: + case X86::COND_BE: + // testnzc -> testnzc (no change). 
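The CC table above encodes flag identities that are easy to sanity-check outside SelectionDAG. Below is a brute-force scalar model of the EFLAGS semantics quoted in the comment; the 64-bit AND stands in for the vector-wide reduction PTEST/TESTP perform, and the helper names are illustrative only:

#include <cassert>
#include <cstdint>

// Scalar model of how PTEST/TESTP set EFLAGS:
//   ZF = (Op0 & Op1) == 0   ("testz",  read by COND_E / COND_NE)
//   CF = (~Op0 & Op1) == 0  ("testc",  read by COND_B / COND_AE)
static bool ZF(uint64_t A, uint64_t B) { return (A & B) == 0; }
static bool CF(uint64_t A, uint64_t B) { return (~A & B) == 0; }

int main() {
  for (uint64_t A = 0; A < 64; ++A) {
    for (uint64_t B = 0; B < 64; ++B) {
      // PTEST(~A, B): testz becomes testc of (A, B) and vice versa, which
      // is why the combine swaps COND_E <-> COND_B and COND_NE <-> COND_AE.
      assert(ZF(~A, B) == CF(A, B));
      assert(CF(~A, B) == ZF(A, B));
      // TESTZ(A, ~B) == TESTC(B, A).
      assert(ZF(A, ~B) == CF(B, A));
      // TESTZ(-1, B) == TESTZ(B, B) and TESTZ(A, -1) == TESTZ(A, A).
      assert(ZF(~uint64_t(0), B) == ZF(B, B));
      assert(ZF(A, ~uint64_t(0)) == ZF(A, A));
    }
  }
  return 0;
}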
+ InvCC = CC; + break; + default: + InvCC = X86::COND_INVALID; + break; + } + + if (InvCC != X86::COND_INVALID) { + CC = InvCC; + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, NotOp0), Op1); + } + } + + if (CC == X86::COND_E || CC == X86::COND_NE) { + // TESTZ(X,~Y) == TESTC(Y,X) + if (SDValue NotOp1 = IsNOT(Op1, DAG)) { + CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, NotOp1), Op0); + } + + // TESTZ(-1,X) == TESTZ(X,X) + if (ISD::isBuildVectorAllOnes(Op0.getNode())) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); + + // TESTZ(X,-1) == TESTZ(X,X) + if (ISD::isBuildVectorAllOnes(Op1.getNode())) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + } + + return SDValue(); +} + /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. @@ -39764,6 +39868,10 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; + + if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG)) + return R; + return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } @@ -44173,11 +44281,9 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue VZLoad = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(), + LN->getAlign(), LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); @@ -44209,11 +44315,9 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue VZLoad = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(), + LN->getAlign(), LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); @@ -44298,11 +44402,9 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); + SDValue VZLoad = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, LN->getPointerInfo(), + LN->getAlign(), LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, DAG.getBitcast(MVT::v8i16, VZLoad)); DCI.CombineTo(N, Convert); @@ -45546,7 +45648,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, std::pair Tmp = Subtarget.getTargetLowering()->BuildFILD( VT, InVT, SDLoc(N), Ld->getChain(), 
Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), DAG); + Ld->getPointerInfo(), Ld->getAlign(), DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); return Tmp.first; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index c9fea553e282..5b18c09f9cf2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1215,7 +1215,7 @@ namespace llvm { std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, - unsigned Align, + Align Alignment, SelectionDAG &DAG) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; @@ -1564,35 +1564,14 @@ namespace llvm { }; /// Generate unpacklo/unpackhi shuffle mask. - template <typename T> - void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo, - bool Unary) { - assert(Mask.empty() && "Expected an empty shuffle mask vector"); - int NumElts = VT.getVectorNumElements(); - int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - for (int i = 0; i < NumElts; ++i) { - unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; - int Pos = (i % NumEltsInLane) / 2 + LaneStart; - Pos += (Unary ? 0 : NumElts * (i % 2)); - Pos += (Lo ? 0 : NumEltsInLane / 2); - Mask.push_back(Pos); - } - } + void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, + bool Unary); /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation /// imposed by AVX and specific to the unary pattern. Example: /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> - template <typename T> - void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo) { - assert(Mask.empty() && "Expected an empty shuffle mask vector"); - int NumElts = VT.getVectorNumElements(); - for (int i = 0; i < NumElts; ++i) { - int Pos = i / 2; - Pos += (Lo ? 
0 : NumElts / 2); - Mask.push_back(Pos); - } - } + void createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo); } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index f0288adf52ce..8c3b18505157 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -325,19 +325,19 @@ void X86InterleavedAccessGroup::interleave8bitStride4VF8( MVT VT = MVT::v8i16; TransposedMatrix.resize(2); - SmallVector MaskLow; - SmallVector MaskLowTemp1, MaskLowWord; - SmallVector MaskHighTemp1, MaskHighWord; + SmallVector MaskLow; + SmallVector MaskLowTemp1, MaskLowWord; + SmallVector MaskHighTemp1, MaskHighWord; for (unsigned i = 0; i < 8; ++i) { MaskLow.push_back(i); MaskLow.push_back(i + 8); } - createUnpackShuffleMask(VT, MaskLowTemp1, true, false); - createUnpackShuffleMask(VT, MaskHighTemp1, false, false); - scaleShuffleMask(2, MaskHighTemp1, MaskHighWord); - scaleShuffleMask(2, MaskLowTemp1, MaskLowWord); + createUnpackShuffleMask(VT, MaskLowTemp1, true, false); + createUnpackShuffleMask(VT, MaskHighTemp1, false, false); + scaleShuffleMask(2, MaskHighTemp1, MaskHighWord); + scaleShuffleMask(2, MaskLowTemp1, MaskLowWord); // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7 // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7 Value *IntrVec1Low = @@ -367,25 +367,25 @@ void X86InterleavedAccessGroup::interleave8bitStride4( MVT HalfVT = scaleVectorType(VT); TransposedMatrix.resize(4); - SmallVector MaskHigh; - SmallVector MaskLow; - SmallVector LowHighMask[2]; - SmallVector MaskHighTemp; - SmallVector MaskLowTemp; + SmallVector MaskHigh; + SmallVector MaskLow; + SmallVector LowHighMask[2]; + SmallVector MaskHighTemp; + SmallVector MaskLowTemp; // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86 // shuffle pattern. - createUnpackShuffleMask(VT, MaskLow, true, false); - createUnpackShuffleMask(VT, MaskHigh, false, false); + createUnpackShuffleMask(VT, MaskLow, true, false); + createUnpackShuffleMask(VT, MaskHigh, false, false); // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86 // shuffle pattern. - createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false); - createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false); - scaleShuffleMask(2, MaskLowTemp, LowHighMask[0]); - scaleShuffleMask(2, MaskHighTemp, LowHighMask[1]); + createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false); + createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false); + scaleShuffleMask(2, MaskLowTemp, LowHighMask[0]); + scaleShuffleMask(2, MaskHighTemp, LowHighMask[1]); // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23 // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... 
c31 m31 diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 02fff40261d1..1e5591fd8d18 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -5030,6 +5030,7 @@ struct AAHeapToStackImpl : public AAHeapToStack { LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall << "\n"); + MaybeAlign Alignment; Constant *Size; if (isCallocLikeFn(MallocCall, TLI)) { auto *Num = cast(MallocCall->getOperand(0)); @@ -5037,13 +5038,19 @@ struct AAHeapToStackImpl : public AAHeapToStack { APInt TotalSize = SizeT->getValue() * Num->getValue(); Size = ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize); + } else if (isAlignedAllocLikeFn(MallocCall, TLI)) { + Size = cast(MallocCall->getOperand(1)); + Alignment = MaybeAlign(cast(MallocCall->getOperand(0)) + ->getValue() + .getZExtValue()); } else { Size = cast(MallocCall->getOperand(0)); } unsigned AS = cast(MallocCall->getType())->getAddressSpace(); - Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, - Size, "", MallocCall->getNextNode()); + Instruction *AI = + new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment, + "", MallocCall->getNextNode()); if (AI->getType() != MallocCall->getType()) AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc", @@ -5175,8 +5182,9 @@ ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { return true; bool IsMalloc = isMallocLikeFn(&I, TLI); + bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI); bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI); - if (!IsMalloc && !IsCalloc) { + if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) { BadMallocCalls.insert(&I); return true; } @@ -5188,6 +5196,14 @@ ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { MallocCalls.insert(&I); return true; } + } else if (IsAlignedAllocLike && isa(I.getOperand(0))) { + // Only if the alignment and sizes are constant. + if (auto *Size = dyn_cast(I.getOperand(1))) + if (Size->getValue().ule(MaxHeapToStackSize)) + if (UsesCheck(I) || FreeCheck(I)) { + MallocCalls.insert(&I); + return true; + } } else if (IsCalloc) { bool Overflow = false; if (auto *Num = dyn_cast(I.getOperand(0))) @@ -5219,8 +5235,9 @@ struct AAHeapToStackFunction final : public AAHeapToStackImpl { /// See AbstractAttribute::trackStatistics(). void trackStatistics() const override { - STATS_DECL(MallocCalls, Function, - "Number of malloc calls converted to allocas"); + STATS_DECL( + MallocCalls, Function, + "Number of malloc/calloc/aligned_alloc calls converted to allocas"); for (auto *C : MallocCalls) if (!BadMallocCalls.count(C)) ++BUILD_STAT_NAME(MallocCalls, Function); @@ -7292,6 +7309,16 @@ struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating { /// Attributor /// ---------------------------------------------------------------------------- +Attributor::~Attributor() { + // The abstract attributes are allocated via the BumpPtrAllocator Allocator, + // thus we cannot delete them. We can, and want to, destruct them though. 
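The destructor above separates destruction from deallocation: objects placed in a BumpPtrAllocator are freed wholesale along with the allocator's pages, so delete is never legal on them, but their destructors still have to run explicitly. A self-contained sketch of that ownership pattern in plain C++ (toy types, not the Attributor classes):

#include <iostream>
#include <new>
#include <vector>

// Toy version of the bump-allocator pattern: objects are placement-new'ed
// into a flat buffer, so `delete` is illegal (the memory never came from
// `new`), but destructors must still be invoked by hand.
struct Tracked {
  int Id;
  explicit Tracked(int Id) : Id(Id) {}
  ~Tracked() { std::cout << "destroying " << Id << "\n"; }
};

int main() {
  alignas(Tracked) unsigned char Arena[sizeof(Tracked) * 4];
  std::vector<Tracked *> All;
  for (int I = 0; I < 4; ++I)
    All.push_back(new (Arena + I * sizeof(Tracked)) Tracked(I));

  // Mirror of `AA->~AbstractAttribute()` above: destruct explicitly; the
  // arena itself is released wholesale (here: automatic storage), never
  // per object.
  for (Tracked *T : All)
    T->~Tracked();
  return 0;
}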
+ for (AbstractAttribute *AA : AllAbstractAttributes) + AA->~AbstractAttribute(); + + for (auto &It : ArgumentReplacementMap) + DeleteContainerPointers(It.second); +} + bool Attributor::isAssumedDead(const AbstractAttribute &AA, const AAIsDead *FnLivenessAA, bool CheckBBLivenessOnly, DepClassTy DepClass) { @@ -8874,7 +8901,7 @@ const char AAValueConstantRange::ID = 0; #define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \ case IRPosition::PK: \ - AA = new CLASS##SUFFIX(IRP); \ + AA = new (A.Allocator) CLASS##SUFFIX(IRP); \ break; #define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 899e8b876a48..195916ead66a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -72,7 +72,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { - I->setOperand(0, V2); + IC.replaceOperand(*I, 0, V2); MadeChange = true; } @@ -591,7 +591,7 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { return false; // Change the div/rem to use 'Y' instead of the select. - I.setOperand(1, SI->getOperand(NonNullOperand)); + replaceOperand(I, 1, SI->getOperand(NonNullOperand)); // Okay, we know we replace the operand of the div/rem with 'Y' with no // problem. However, the select, or the condition of the select may have @@ -619,11 +619,11 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end(); I != E; ++I) { if (*I == SI) { - *I = SI->getOperand(NonNullOperand); + replaceUse(*I, SI->getOperand(NonNullOperand)); Worklist.push(&*BBI); } else if (*I == SelectCond) { - *I = NonNullOperand == 1 ? ConstantInt::getTrue(CondTy) - : ConstantInt::getFalse(CondTy); + replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy) + : ConstantInt::getFalse(CondTy)); Worklist.push(&*BBI); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 887369e64852..15ecfe96c48d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2696,8 +2696,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // paths for the values (this helps GetUnderlyingObjects() for example). 
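The nested-select merges in the hunks below are pure Boolean identities, so they can be verified exhaustively. A small standalone check in plain C++, where sel models the IR select:

#include <cassert>

// The two visitSelectInst folds, checked over all boolean inputs:
//   sel(C0, sel(C1, A, B), B) == sel(C0 & C1, A, B)
//   sel(C0, A, sel(C1, A, B)) == sel(C0 | C1, A, B)
static int sel(bool C, int T, int F) { return C ? T : F; }

int main() {
  const int A = 1, B = 2;
  for (bool C0 : {false, true})
    for (bool C1 : {false, true}) {
      assert(sel(C0, sel(C1, A, B), B) == sel(C0 && C1, A, B));
      assert(sel(C0, A, sel(C1, A, B)) == sel(C0 || C1, A, B));
    }
  return 0;
}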
if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) { Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition()); - SI.setOperand(0, And); - SI.setOperand(1, TrueSI->getTrueValue()); + replaceOperand(SI, 0, And); + replaceOperand(SI, 1, TrueSI->getTrueValue()); return &SI; } } @@ -2713,8 +2713,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b) if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) { Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition()); - SI.setOperand(0, Or); - SI.setOperand(2, FalseSI->getFalseValue()); + replaceOperand(SI, 0, Or); + replaceOperand(SI, 2, FalseSI->getFalseValue()); return &SI; } } @@ -2741,14 +2741,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { canMergeSelectThroughBinop(TrueBO)) { if (auto *TrueBOSI = dyn_cast(TrueBO->getOperand(0))) { if (TrueBOSI->getCondition() == CondVal) { - TrueBO->setOperand(0, TrueBOSI->getTrueValue()); + replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue()); Worklist.push(TrueBO); return &SI; } } if (auto *TrueBOSI = dyn_cast(TrueBO->getOperand(1))) { if (TrueBOSI->getCondition() == CondVal) { - TrueBO->setOperand(1, TrueBOSI->getTrueValue()); + replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue()); Worklist.push(TrueBO); return &SI; } @@ -2761,14 +2761,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { canMergeSelectThroughBinop(FalseBO)) { if (auto *FalseBOSI = dyn_cast(FalseBO->getOperand(0))) { if (FalseBOSI->getCondition() == CondVal) { - FalseBO->setOperand(0, FalseBOSI->getFalseValue()); + replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue()); Worklist.push(FalseBO); return &SI; } } if (auto *FalseBOSI = dyn_cast(FalseBO->getOperand(1))) { if (FalseBOSI->getCondition() == CondVal) { - FalseBO->setOperand(1, FalseBOSI->getFalseValue()); + replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue()); Worklist.push(FalseBO); return &SI; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 23370cf21f07..c3890ed69421 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -910,6 +910,18 @@ Value *InstCombiner::freelyNegateValue(Value *V) { return Builder.CreateSub( I->getOperand(1), I->getOperand(0), I->getName() + ".neg"); + // Negation is equivalent to bitwise-not + 1: + // 0 - (A ^ C) => ((A ^ C) ^ -1) + 1 => A ^ ~C + 1 + case Instruction::Xor: { + Constant *C; + if (match(I->getOperand(1), m_Constant(C))) { + Value *Xor = Builder.CreateXor(I->getOperand(0), ConstantExpr::getNot(C)); + return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1), + I->getName() + ".neg"); + } + return nullptr; + } + // 0-(A sdiv C) => A sdiv (0-C) provided the negation doesn't overflow. case Instruction::SDiv: { Constant *C = dyn_cast(I->getOperand(1)); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index d831b0da37a5..2a6ecfcd957a 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -947,6 +947,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // Loading the allocation -> undef. if (isa(DepInst) || isMallocLikeFn(DepInst, TLI) || + isAlignedAllocLikeFn(DepInst, TLI) || // Loading immediately after lifetime begin -> undef. 
isLifetimeStart(DepInst)) { Res = AvailableValue::get(UndefValue::get(LI->getType())); @@ -1451,7 +1452,7 @@ static bool impliesEquivalanceIfFalse(CmpInst* Cmp) { Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); // If we can prove either side non-zero, then equality must imply - // equivalence. + // equivalence. // FIXME: We should do this optimization if 'no signed zeros' is // applicable via an instruction-level fast-math-flag or some other // indicator that relaxed FP semantics are being used. @@ -1516,10 +1517,10 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // If we find an equality fact, canonicalize all dominated uses in this block // to one of the two values. We heuristically choice the "oldest" of the // two where age is determined by value number. (Note that propagateEquality - // above handles the cross block case.) - // + // above handles the cross block case.) + // // Key case to cover are: - // 1) + // 1) // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen // call void @llvm.assume(i1 %cmp) // ret float %0 ; will change it to ret float 3.000000e+00 @@ -1560,7 +1561,7 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { << *CmpLHS << " with " << *CmpRHS << " in block " << IntrinsicI->getParent()->getName() << "\n"); - + // Setup the replacement map - this handles uses within the same block if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) @@ -1826,7 +1827,7 @@ void GVN::assignBlockRPONumber(Function &F) { bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); + Value *Operand = Instr->getOperand(OpNum); auto it = ReplaceOperandsWithMap.find(Operand); if (it != ReplaceOperandsWithMap.end()) { LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " @@ -1946,7 +1947,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, // If "A == B" is known true, or "A != B" is known false, then replace // A with B everywhere in the scope. For floating point operations, we - // have to be careful since equality does not always imply equivalance. + // have to be careful since equality does not always imply equivalance. 
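The "equality does not always imply equivalence" caveat above is, in the floating-point case, mostly about signed zeros: fcmp-style equality treats +0.0 and -0.0 as equal even though the two values are distinguishable, so GVN may not blindly substitute one for the other. A two-assert illustration in plain C++:

#include <cassert>
#include <cmath>

int main() {
  double PosZero = 0.0, NegZero = -0.0;
  assert(PosZero == NegZero);  // an oeq-style equality fact holds...
  // ...yet the values are distinguishable, e.g. by their sign bit, so
  // replacing one with the other can change observable results:
  assert(std::copysign(1.0, PosZero) != std::copysign(1.0, NegZero));
  return 0;
}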
if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) || (isKnownFalse && impliesEquivalanceIfFalse(Cmp))) Worklist.push_back(std::make_pair(Op0, Op1)); diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index d0b96218137c..5fee60fc6bf9 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -350,7 +350,7 @@ using ModelledPHISet = DenseSet>; class InstructionUseExpr : public GVNExpression::BasicExpression { unsigned MemoryUseOrder = -1; bool Volatile = false; - std::vector ShuffleMask; + ArrayRef ShuffleMask; public: InstructionUseExpr(Instruction *I, ArrayRecycler &R, @@ -360,6 +360,9 @@ class InstructionUseExpr : public GVNExpression::BasicExpression { setOpcode(I->getOpcode()); setType(I->getType()); + if (ShuffleVectorInst *SVI = dyn_cast(I)) + ShuffleMask = SVI->getShuffleMask().copy(A); + for (auto &U : I->uses()) op_push_back(U.getUser()); llvm::sort(op_begin(), op_end()); @@ -367,18 +370,15 @@ class InstructionUseExpr : public GVNExpression::BasicExpression { void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } void setVolatile(bool V) { Volatile = V; } - void setShuffleMask(ArrayRef Mask) { - ShuffleMask.assign(Mask.begin(), Mask.end()); - } hash_code getHashValue() const override { return hash_combine(GVNExpression::BasicExpression::getHashValue(), - MemoryUseOrder, Volatile, ArrayRef(ShuffleMask)); + MemoryUseOrder, Volatile, ShuffleMask); } template hash_code getHashValue(Function MapFn) { hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile, - ArrayRef(ShuffleMask)); + ShuffleMask); for (auto *V : operands()) H = hash_combine(H, MapFn(V)); return H; @@ -406,8 +406,6 @@ class ValueTable { CmpInst::Predicate Predicate = C->getPredicate(); E->setOpcode((C->getOpcode() << 8) | Predicate); } - if (ShuffleVectorInst *SVI = dyn_cast(I)) - E->setShuffleMask(SVI->getShuffleMask()); return E; } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 327a1a6f2e7b..a9a0070c1d57 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -2833,6 +2833,16 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { /// select is not jump-threaded, it will be folded again in the later /// optimizations. bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { + // This transform can introduce a UB (a conditional branch that depends on a + // poison value) that was not present in the original program. See + // @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll. + // Disable this transform under MemorySanitizer. + // FIXME: either delete it or replace with a valid transform. This issue is + // not limited to MemorySanitizer (but has only been observed as an MSan false + // positive in practice so far). + if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) + return false; + // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB)) diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 17b2c534513d..b93a8bfeaa46 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1470,7 +1470,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, // undef value. 
This can happen when loading for a fresh allocation with no // intervening stores, for example. Note that this is only true in the case // that the result of the allocation is pointer equal to the load ptr. - if (isa(DepInst) || isMallocLikeFn(DepInst, TLI)) { + if (isa(DepInst) || isMallocLikeFn(DepInst, TLI) || + isAlignedAllocLikeFn(DepInst, TLI)) { return createConstantExpression(UndefValue::get(LoadType)); } // If this load occurs either right after a lifetime begin, diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 0c51fd5ff423..05025747db8e 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -662,7 +662,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { public: SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) : PtrUseVisitor(DL), - AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {} + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()), + AS(AS) {} private: void markAsDead(Instruction &I) { @@ -751,8 +752,10 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { // For array or vector indices, scale the index by the size of the // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); - GEPOffset += Index * APInt(Offset.getBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); + GEPOffset += + Index * + APInt(Offset.getBitWidth(), + DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize()); } // If this index has computed an intermediate pointer which is not @@ -787,7 +790,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { LI.getPointerAddressSpace() != DL.getAllocaAddrSpace()) return PI.setAborted(&LI); - uint64_t Size = DL.getTypeStoreSize(LI.getType()); + uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize(); return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } @@ -802,7 +805,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { SI.getPointerAddressSpace() != DL.getAllocaAddrSpace()) return PI.setAborted(&SI); - uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); + uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize(); // If this memory access can be shown to *statically* extend outside the // bounds of the allocation, it's behavior is undefined, so simply @@ -1220,7 +1223,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { if (BBI->mayWriteToMemory()) return false; - uint64_t Size = DL.getTypeStoreSize(LI->getType()); + uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize(); MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment())); MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize; HaveLoad = true; @@ -1478,7 +1481,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast(Ty)) { - unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = + DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize(); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. 
return nullptr; @@ -1495,7 +1499,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, if (ArrayType *ArrTy = dyn_cast(Ty)) { Type *ElementTy = ArrTy->getElementType(); - APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), + DL.getTypeAllocSize(ElementTy).getFixedSize()); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return nullptr; @@ -1517,7 +1522,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); - if (Offset.uge(DL.getTypeAllocSize(ElementTy))) + if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize())) return nullptr; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); @@ -1549,7 +1554,8 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return nullptr; // We can't GEP through an unsized element. - APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), + DL.getTypeAllocSize(ElementTy).getFixedSize()); if (ElementSize == 0) return nullptr; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); @@ -1716,7 +1722,8 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return false; } - if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) + if (DL.getTypeSizeInBits(NewTy).getFixedSize() != + DL.getTypeSizeInBits(OldTy).getFixedSize()) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; @@ -1889,7 +1896,8 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Return if bitcast to vectors is different for total size in bits. if (!CandidateTys.empty()) { VectorType *V = CandidateTys[0]; - if (DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(V)) { + if (DL.getTypeSizeInBits(VTy).getFixedSize() != + DL.getTypeSizeInBits(V).getFixedSize()) { CandidateTys.clear(); return; } @@ -1935,7 +1943,8 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // they're all integer vectors. We sort by ascending number of elements. auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) { (void)DL; - assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) && + assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() == + DL.getTypeSizeInBits(LHSTy).getFixedSize() && "Cannot have vector types of different sizes!"); assert(RHSTy->getElementType()->isIntegerTy() && "All non-integer types eliminated!"); @@ -1963,13 +1972,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Try each vector type, and return the one which works. auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { - uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); + uint64_t ElementSize = + DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize(); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. 
if (ElementSize % 8) return false; - assert((DL.getTypeSizeInBits(VTy) % 8) == 0 && + assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 && "vector size not a multiple of element size?"); ElementSize /= 8; @@ -1999,7 +2009,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, Type *AllocaTy, const DataLayout &DL, bool &WholeAllocaOp) { - uint64_t Size = DL.getTypeStoreSize(AllocaTy); + uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize(); uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; @@ -2015,7 +2025,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. - if (DL.getTypeStoreSize(LI->getType()) > Size) + if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerLoad. @@ -2027,7 +2037,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (!isa(LI->getType()) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast(LI->getType())) { - if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize()) return false; } else if (RelBegin != 0 || RelEnd != Size || !canConvertValue(DL, AllocaTy, LI->getType())) { @@ -2040,7 +2050,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. - if (DL.getTypeStoreSize(ValueTy) > Size) + if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerStore. @@ -2052,7 +2062,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (!isa(ValueTy) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast(ValueTy)) { - if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize()) return false; } else if (RelBegin != 0 || RelEnd != Size || !canConvertValue(DL, ValueTy, AllocaTy)) { @@ -2083,13 +2093,13 @@ static bool isIntegerWideningViableForSlice(const Slice &S, /// promote the resulting alloca. static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { - uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); + uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize(); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) return false; // Don't try to handle allocas with bit-padding. 
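The bit-padding bail-out this comment introduces (first hunk below) compares a type's size in bits with its store size in bits; the two diverge exactly when the width is not a multiple of 8 and DataLayout rounds the store size up to whole bytes. A tiny model of that round-up in plain C++ (storeSizeInBits is an illustrative stand-in, not the DataLayout API):

#include <cassert>
#include <cstdint>

// An iN whose width is not a multiple of 8 stores as ceil(N/8) bytes, so
// its size-in-bits and store-size-in-bits disagree and integer widening
// bails out.
static uint64_t storeSizeInBits(uint64_t TypeSizeInBits) {
  return ((TypeSizeInBits + 7) / 8) * 8;
}

int main() {
  assert(storeSizeInBits(32) == 32); // i32: no padding, widening viable
  assert(storeSizeInBits(17) == 24); // i17: 7 padding bits -> bail out
  assert(storeSizeInBits(1) == 8);   // i1
  return 0;
}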
- if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) + if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize()) return false; // We need to ensure that an integer type with the appropriate bitwidth can @@ -2128,11 +2138,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, const Twine &Name) { LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); IntegerType *IntTy = cast(V->getType()); - assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <= + DL.getTypeStoreSize(IntTy).getFixedSize() && "Element extends past full value"); uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() - + DL.getTypeStoreSize(Ty).getFixedSize() - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -2157,11 +2169,13 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, V = IRB.CreateZExt(V, IntTy, Name + ".ext"); LLVM_DEBUG(dbgs() << " extended: " << *V << "\n"); } - assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <= + DL.getTypeStoreSize(IntTy).getFixedSize() && "Element store outside of alloca store"); uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() - + DL.getTypeStoreSize(Ty).getFixedSize() - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -2324,18 +2338,20 @@ class llvm::sroa::AllocaSliceRewriter NewAllocaBeginOffset(NewAllocaBeginOffset), NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), - IntTy(IsIntegerPromotable - ? Type::getIntNTy( - NewAI.getContext(), - DL.getTypeSizeInBits(NewAI.getAllocatedType())) - : nullptr), + IntTy( + IsIntegerPromotable + ? Type::getIntNTy(NewAI.getContext(), + DL.getTypeSizeInBits(NewAI.getAllocatedType()) + .getFixedSize()) + : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8 + : 0), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { - assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && + assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } @@ -2500,7 +2516,8 @@ class llvm::sroa::AllocaSliceRewriter Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); - const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; + const bool IsLoadPastEnd = + DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2568,7 +2585,7 @@ class llvm::sroa::AllocaSliceRewriter assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && + assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() && "Split load isn't smaller than original load"); assert(DL.typeSizeEqualsStoreSize(LI.getType()) && "Non-byte-multiple bit width"); @@ -2626,7 +2643,8 @@ class llvm::sroa::AllocaSliceRewriter bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + if (DL.getTypeSizeInBits(V->getType()).getFixedSize() != + IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, NewAI.getAlign(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); @@ -2661,7 +2679,7 @@ class llvm::sroa::AllocaSliceRewriter if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - if (SliceSize < DL.getTypeStoreSize(V->getType())) { + if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); @@ -2677,7 +2695,8 @@ class llvm::sroa::AllocaSliceRewriter if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI, AATags); - const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; + const bool IsStorePastEnd = + DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && @@ -2792,7 +2811,7 @@ class llvm::sroa::AllocaSliceRewriter auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext()); auto *SrcTy = VectorType::get(Int8Ty, Len); return canConvertValue(DL, SrcTy, AllocaTy) && - DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)); + DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize()); }(); // If this doesn't map cleanly onto the alloca type, and that type isn't @@ -2826,8 +2845,8 @@ class llvm::sroa::AllocaSliceRewriter unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Value *Splat = - getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); + Value *Splat = getIntegerSplat( + II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8); Splat = convertValue(DL, IRB, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); @@ -2860,7 +2879,8 @@ class llvm::sroa::AllocaSliceRewriter assert(NewBeginOffset == NewAllocaBeginOffset); assert(NewEndOffset == NewAllocaEndOffset); - V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); + V = getIntegerSplat(II.getValue(), + DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8); if (VectorType *AllocaVecTy = dyn_cast(AllocaTy)) V = getVectorSplat(V, AllocaVecTy->getNumElements()); @@ -2923,7 +2943,8 @@ class llvm::sroa::AllocaSliceRewriter bool EmitMemCpy = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || - SliceSize != 
DL.getTypeStoreSize(NewAI.getAllocatedType()) || + SliceSize != + DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the @@ -3469,8 +3490,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { if (Ty->isSingleValueType()) return Ty; - uint64_t AllocSize = DL.getTypeAllocSize(Ty); - uint64_t TypeSize = DL.getTypeSizeInBits(Ty); + uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize(); + uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize(); Type *InnerTy; if (ArrayType *ArrTy = dyn_cast(Ty)) { @@ -3483,8 +3504,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { return Ty; } - if (AllocSize > DL.getTypeAllocSize(InnerTy) || - TypeSize > DL.getTypeSizeInBits(InnerTy)) + if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() || + TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize()) return Ty; return stripAggregateTypeWrapping(DL, InnerTy); @@ -3505,15 +3526,15 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// return a type if necessary. static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size) { - if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) + if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size) return stripAggregateTypeWrapping(DL, Ty); - if (Offset > DL.getTypeAllocSize(Ty) || - (DL.getTypeAllocSize(Ty) - Offset) < Size) + if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() || + (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size) return nullptr; if (SequentialType *SeqTy = dyn_cast(Ty)) { Type *ElementTy = SeqTy->getElementType(); - uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize(); uint64_t NumSkippedElements = Offset / ElementSize; if (NumSkippedElements >= SeqTy->getNumElements()) return nullptr; @@ -3553,7 +3574,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, Offset -= SL->getElementOffset(Index); Type *ElementTy = STy->getElementType(Index); - uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize(); if (Offset >= ElementSize) return nullptr; // The offset points into alignment padding. @@ -4121,7 +4142,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Type *SliceTy = nullptr; const DataLayout &DL = AI.getModule()->getDataLayout(); if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) - if (DL.getTypeAllocSize(CommonUseTy) >= P.size()) + if (DL.getTypeAllocSize(CommonUseTy).getFixedSize() >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -4133,7 +4154,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); - assert(DL.getTypeAllocSize(SliceTy) >= P.size()); + assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size()); bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); @@ -4274,7 +4295,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // to be rewritten into a partition. 
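Nearly all of the SROA churn in this file is mechanical fallout from DataLayout size queries returning a TypeSize, whose getFixedSize() asserts that the type is not scalable. A stripped-down model of that API shape, in plain C++ with a hypothetical stand-in class, shows why every call site now unwraps explicitly:

#include <cassert>
#include <cstdint>

// Minimal stand-in for the TypeSize idea used above: a size is a
// (quantity, scalable?) pair, and getFixedSize() is only legal when the
// size is not scalable (i.e. not an SVE-style vector).
class TypeSize {
  uint64_t Quantity;
  bool Scalable;
public:
  TypeSize(uint64_t Quantity, bool Scalable)
      : Quantity(Quantity), Scalable(Scalable) {}
  bool isScalable() const { return Scalable; }
  uint64_t getFixedSize() const {
    assert(!Scalable && "fixed size of a scalable type is unknown");
    return Quantity;
  }
};

int main() {
  TypeSize I32Size(4, /*Scalable=*/false);  // like i32
  TypeSize NxV4I32(16, /*Scalable=*/true);  // like <vscale x 4 x i32>
  assert(I32Size.getFixedSize() == 4);
  // NxV4I32.getFixedSize() would assert; SROA instead skips such allocas.
  assert(NxV4I32.isScalable());
  return 0;
}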
bool IsSorted = true; - uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType()); + uint64_t AllocaSize = + DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize(); const uint64_t MaxBitVectorSize = 1024; if (AllocaSize <= MaxBitVectorSize) { // If a byte boundary is included in any load or store, a slice starting or @@ -4338,7 +4360,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { Changed = true; if (NewAI != &AI) { uint64_t SizeOfByte = 8; - uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType()); + uint64_t AllocaSize = + DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize(); // Don't include any padding. uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); @@ -4358,7 +4381,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { auto *Expr = DbgDeclares.front()->getExpression(); auto VarSize = Var->getSizeInBits(); DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); - uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType()); + uint64_t AllocaSize = + DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize(); for (auto Fragment : Fragments) { // Create a fragment expression describing the new partition or reuse AI's // expression if there is only one partition. @@ -4446,8 +4470,10 @@ bool SROA::runOnAlloca(AllocaInst &AI) { const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. - if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - DL.getTypeAllocSize(AI.getAllocatedType()) == 0) + auto *AT = AI.getAllocatedType(); + if (AI.isArrayAllocation() || !AT->isSized() || + (isa(AT) && cast(AT)->isScalable()) || + DL.getTypeAllocSize(AT).getFixedSize() == 0) return false; bool Changed = false; @@ -4567,8 +4593,15 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); I != E; ++I) { - if (AllocaInst *AI = dyn_cast(I)) - Worklist.insert(AI); + if (AllocaInst *AI = dyn_cast(I)) { + if (isa(AI->getAllocatedType()) && + cast(AI->getAllocatedType())->isScalable()) { + if (isAllocaPromotable(AI)) + PromotableAllocas.push_back(AI); + } else { + Worklist.insert(AI); + } + } } bool Changed = false; diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 0eca6704b496..deff56b9e27e 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -522,7 +522,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, // sext(a + b) = sext(a) + sext(b) // even if the addition is not marked nsw. // - // Leveraging this invarient, we can trace into an sext'ed inbound GEP + // Leveraging this invariant, we can trace into an sext'ed inbound GEP // index if the constant offset is non-negative. // // Verified in @sext_add in split-gep.ll. @@ -552,6 +552,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, bool SignExtended, bool ZeroExtended) { + // Save off the current height of the chain, in case we need to restore it. + size_t ChainLength = UserChain.size(); + // BO being non-negative does not shed light on whether its operands are // non-negative. Clear the NonNegative flag here. 
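The save/restore being added here (hunk continues below) is ordinary backtracking discipline: whatever the search pushed onto UserChain while exploring an operand that yielded no constant offset must be unwound before the other operand is tried. The same pattern in a self-contained plain-C++ sketch (a hypothetical tree search, not the pass itself):

#include <cassert>
#include <vector>

// A recursive search records its path in a shared chain; a failed branch
// shrinks the chain back to its pre-exploration height before trying the
// next child, exactly the resize-to-saved-length fix above.
static bool findLeaf(const std::vector<std::vector<int>> &Children, int Node,
                     int Target, std::vector<int> &Chain) {
  Chain.push_back(Node);
  if (Node == Target)
    return true;
  size_t ChainLength = Chain.size(); // saved height
  for (int Child : Children[Node]) {
    if (findLeaf(Children, Child, Target, Chain))
      return true;
    Chain.resize(ChainLength); // this branch didn't pan out: backtrack
  }
  Chain.pop_back();
  return false;
}

int main() {
  // Node 0 -> {1, 2}, node 1 -> {3}; nodes 2 and 3 are leaves.
  std::vector<std::vector<int>> Children = {{1, 2}, {3}, {}, {}};
  std::vector<int> Chain;
  assert(findLeaf(Children, 0, 2, Chain));
  assert((Chain == std::vector<int>{0, 2})); // 1 and 3 were unwound
  return 0;
}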
APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, @@ -562,12 +565,22 @@ APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, // However, such cases are probably already handled by -instcombine, // given this pass runs after the standard optimizations. if (ConstantOffset != 0) return ConstantOffset; + + // Reset the chain back to where it was when we started exploring this node, + // since visiting the LHS didn't pan out. + UserChain.resize(ChainLength); + ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, /* NonNegative */ false); // If U is a sub operator, negate the constant offset found in the right // operand. if (BO->getOpcode() == Instruction::Sub) ConstantOffset = -ConstantOffset; + + // If RHS wasn't a suitable candidate either, reset the chain again. + if (ConstantOffset == 0) + UserChain.resize(ChainLength); + return ConstantOffset; } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 36d05345f23b..b8b3d1895093 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -80,21 +80,11 @@ EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true), cl::Hidden, cl::desc("Convert noalias attributes to metadata during inlining.")); -static cl::opt UpdateReturnAttributes( - "update-return-attrs", cl::init(true), cl::Hidden, - cl::desc("Update return attributes on calls within inlined body")); - static cl::opt PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", cl::init(true), cl::Hidden, cl::desc("Convert align attributes to assumptions during inlining.")); -static cl::opt InlinerAttributeWindow( - "inliner-attribute-window", cl::Hidden, - cl::desc("the maximum number of instructions analyzed for may throw during " - "attribute inference in inlined body"), - cl::init(4)); - llvm::InlineResult llvm::InlineFunction(CallBase *CB, InlineFunctionInfo &IFI, AAResults *CalleeAAR, bool InsertLifetime) { @@ -1146,81 +1136,6 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, } } -static bool MayContainThrowingOrExitingCall(Instruction *Begin, - Instruction *End) { - - assert(Begin->getParent() == End->getParent() && - "Expected to be in same basic block!"); - unsigned NumInstChecked = 0; - // Check that all instructions in the range [Begin, End) are guaranteed to - // transfer execution to successor. - for (auto &I : make_range(Begin->getIterator(), End->getIterator())) - if (NumInstChecked++ > InlinerAttributeWindow || - !isGuaranteedToTransferExecutionToSuccessor(&I)) - return true; - return false; -} - -static void AddReturnAttributes(CallSite CS, ValueToValueMapTy &VMap) { - if (!UpdateReturnAttributes) - return; - AttrBuilder AB(CS.getAttributes(), AttributeList::ReturnIndex); - if (AB.empty()) - return; - - auto *CalledFunction = CS.getCalledFunction(); - auto &Context = CalledFunction->getContext(); - - for (auto &BB : *CalledFunction) { - auto *RI = dyn_cast(BB.getTerminator()); - if (!RI || !isa(RI->getOperand(0))) - continue; - // Sanity check that the cloned return instruction exists and is a return - // instruction itself. - auto *NewRI = dyn_cast_or_null(VMap.lookup(RI)); - if (!NewRI) - continue; - auto *RetVal = cast(RI->getOperand(0)); - // Sanity check that the cloned RetVal exists and is a call. - // Simplification during inlining could have transformed the cloned - // instruction. 
- auto *NewRetVal = dyn_cast_or_null(VMap.lookup(RetVal)); - if (!NewRetVal) - continue; - // Backward propagation of attributes to the returned value may be incorrect - // if it is control flow dependent. - // Consider: - // @callee { - // %rv = call @foo() - // %rv2 = call @bar() - // if (%rv2 != null) - // return %rv2 - // if (%rv == null) - // exit() - // return %rv - // } - // caller() { - // %val = call nonnull @callee() - // } - // Here we cannot add the nonnull attribute on either foo or bar. So, we - // limit the check to both NewRetVal and NewRI are in the same basic block - // and there are no throwing/exiting instructions between these - // instructions. - if (NewRI->getParent() != NewRetVal->getParent() || - MayContainThrowingOrExitingCall(NewRetVal, NewRI)) - continue; - // Add to the existing attributes of NewRetVal. - // NB! When we have the same attribute already existing on NewRetVal, but - // with a differing value, the AttributeList's merge API honours the already - // existing attribute value (i.e. attributes such as dereferenceable, - // dereferenceable_or_null etc). See AttrBuilder::merge for more details. - AttributeList AL = NewRetVal->getAttributes(); - AttributeList NewAL = - AL.addAttributes(Context, AttributeList::ReturnIndex, AB); - NewRetVal->setAttributes(NewAL); - } -} - /// If the inlined function has non-byval align arguments, then /// add @llvm.assume-based alignment assumptions to preserve this information. static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { @@ -1886,10 +1801,6 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Add noalias metadata if necessary. AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); - // Clone return attributes on the callsite into the calls within the inlined - // function which feed into its return value. - AddReturnAttributes(CS, VMap); - // Propagate llvm.mem.parallel_loop_access if necessary. 
   PropagateParallelLoopAccessMetadata(CS, VMap);
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 77fe6c1cb12a..10eb1212a591 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -832,6 +832,7 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
         B.CreateICmp(Old->getPredicate(), StrNCmp,
                      ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
     replaceAllUsesWith(Old, Cmp);
+    eraseFromParent(Old);
   }
   return CI;
 }
@@ -2170,8 +2171,10 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
   auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
                                  Value *Res) {
-    for (CallInst *C : Calls)
+    for (CallInst *C : Calls) {
       replaceAllUsesWith(C, Res);
+      eraseFromParent(C);
+    }
   };
 
   replaceTrigInsts(SinCalls, Sin);
diff --git a/llvm/test/CodeGen/AArch64/funclet-match-add-sub-stack.ll b/llvm/test/CodeGen/AArch64/funclet-match-add-sub-stack.ll
new file mode 100644
index 000000000000..67e9c49675cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/funclet-match-add-sub-stack.ll
@@ -0,0 +1,62 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows | FileCheck %s
+; Check that the stack bump around a funclet is computed correctly in both the
+; prologue and epilogue in the case where we have a MaxCallFrameSize > 0 and are doing an alloca
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-pc-windows-msvc19.25.28611"
+
+; // requires passing arguments on the stack
+; void test2(void*, int, int, int, int, int, int, int, int);
+;
+; // function with the funclet being checked
+; void test1(size_t bytes)
+; {
+;   // alloca forces a separate callee save bump and stack bump
+;   void *data = _alloca(bytes);
+;   try {
+;     test2(data, 0, 1, 2, 3, 4, 5, 6, 7);
+;   } catch (...) {
+;     // the funclet being checked
+;   }
+; }
+
+; CHECK-LABEL: ?catch$2@?0??test1@@YAX_K@Z@4HA
+; CHECK: sub sp, sp, #16
+; CHECK: add sp, sp, #16
+; Function Attrs: uwtable
+define dso_local void @"?test1@@YAX_K@Z"(i64 %0) #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+  %2 = alloca i64, align 8
+  %3 = alloca i8*, align 8
+  store i64 %0, i64* %2, align 8
+  %4 = load i64, i64* %2, align 8
+  %5 = alloca i8, i64 %4, align 16
+  store i8* %5, i8** %3, align 8
+  %6 = load i8*, i8** %3, align 8
+  invoke void @"?test2@@YAXPEAXHHHHHHHH@Z"(i8* %6, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7)
+          to label %13 unwind label %7
+
+7:                                                ; preds = %1
+  %8 = catchswitch within none [label %9] unwind to caller
+
+9:                                                ; preds = %7
+  %10 = catchpad within %8 [i8* null, i32 64, i8* null]
+  catchret from %10 to label %11
+
+11:                                               ; preds = %9
+  br label %12
+
+12:                                               ; preds = %11, %13
+  ret void
+
+13:                                               ; preds = %1
+  br label %12
+}
+
+declare dso_local void @"?test2@@YAXPEAXHHHHHHHH@Z"(i8*, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare dso_local i32 @__CxxFrameHandler3(...)
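+; Note: test2 takes nine register-sized arguments, so on AArch64 one of them
+; is passed on the stack. The 16 bytes checked above are presumably that
+; outgoing-argument area (8 bytes rounded up to the 16-byte stack alignment),
+; which the funclet must both allocate and deallocate itself.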
+ +attributes #0 = { uwtable } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"wchar_size", i32 2} diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll index 66558c90a79c..dbc6c4b0804b 100644 --- a/llvm/test/CodeGen/AArch64/seh-finally.ll +++ b/llvm/test/CodeGen/AArch64/seh-finally.ll @@ -37,7 +37,7 @@ entry: ; CHECK-LABEL: simple_seh ; CHECK: add x29, sp, #16 ; CHECK: mov x0, #-2 -; CHECK: stur x0, [x29, #-16] +; CHECK: stur x0, [x29, #16] ; CHECK: .set .Lsimple_seh$frame_escape_0, -8 ; CHECK: ldur w0, [x29, #-8] ; CHECK: bl foo @@ -87,13 +87,13 @@ define void @stack_realign() #0 personality i8* bitcast (i32 (...)* @__C_specifi entry: ; CHECK-LABEL: stack_realign ; CHECK: mov x29, sp -; CHECK: sub x9, sp, #64 +; CHECK: sub x9, sp, #16 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp ; CHECK: mov x0, #-2 -; CHECK: stur x0, [x19, #16] -; CHECK: .set .Lstack_realign$frame_escape_0, 32 -; CHECK: ldr w0, [x19, #32] +; CHECK: stur x0, [x29, #32] +; CHECK: .set .Lstack_realign$frame_escape_0, 0 +; CHECK: ldr w0, [x19] ; CHECK: bl foo %o = alloca %struct.S, align 32 @@ -142,7 +142,7 @@ entry: ; CHECK-LABEL: vla_present ; CHECK: add x29, sp, #32 ; CHECK: mov x1, #-2 -; CHECK: stur x1, [x29, #-32] +; CHECK: stur x1, [x29, #16] ; CHECK: .set .Lvla_present$frame_escape_0, -4 ; CHECK: stur w0, [x29, #-4] ; CHECK: ldur w8, [x29, #-4] @@ -206,17 +206,17 @@ define void @vla_and_realign(i32 %n) #0 personality i8* bitcast (i32 (...)* @__C entry: ; CHECK-LABEL: vla_and_realign ; CHECK: mov x29, sp -; CHECK: sub x9, sp, #64 +; CHECK: sub x9, sp, #48 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp ; CHECK: mov x1, #-2 -; CHECK: stur x1, [x19] +; CHECK: stur x1, [x29, #32] ; CHECK: .set .Lvla_and_realign$frame_escape_0, 32 -; CHECK: str w0, [x29, #28] -; CHECK: ldr w8, [x29, #28] +; CHECK: str w0, [x29, #44] +; CHECK: ldr w8, [x29, #44] ; CHECK: mov x9, sp -; CHECK: str x9, [x19, #24] -; CHECK: str x8, [x19, #16] +; CHECK: str x9, [x29, #24] +; CHECK: str x8, [x19, #24] ; CHECK: ldr w0, [x19, #32] ; CHECK: bl foo diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch-cbz.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch-cbz.ll index d84c07f8bc1a..cbed64ab99e3 100644 --- a/llvm/test/CodeGen/AArch64/wineh-try-catch-cbz.ll +++ b/llvm/test/CodeGen/AArch64/wineh-try-catch-cbz.ll @@ -4,11 +4,10 @@ ; but the original issue only reproduced if the cbz was immediately ; after the frame setup.) -; CHECK: sub sp, sp, #32 -; CHECK-NEXT: stp x29, x30, [sp, #16] -; CHECK-NEXT: add x29, sp, #16 +; CHECK: stp x29, x30, [sp, #-32]! +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: mov x1, #-2 -; CHECK-NEXT: stur x1, [x29, #-16] +; CHECK-NEXT: stur x1, [x29, #16] ; CHECK-NEXT: cbz w0, .LBB0_2 target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch-realign.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch-realign.ll index b10a0f3033a0..a66ec10748e7 100644 --- a/llvm/test/CodeGen/AArch64/wineh-try-catch-realign.ll +++ b/llvm/test/CodeGen/AArch64/wineh-try-catch-realign.ll @@ -12,7 +12,7 @@ ; CHECK: stp x29, x30, [sp, #-32]! 
; CHECK-NEXT: str x28, [sp, #16] ; CHECK-NEXT: str x19, [sp, #24] -; CHECK-NEXT: add x0, x19, #64 +; CHECK-NEXT: add x0, x19, #0 ; CHECK-NEXT: mov w1, wzr ; CHECK-NEXT: bl "?bb@@YAXPEAHH@Z" ; CHECK-NEXT: adrp x0, .LBB0_1 diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll index 3ae2df37efe4..73909825d377 100644 --- a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll +++ b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll @@ -11,11 +11,11 @@ ; and the parent function. ; The following checks that the unwind help object has -2 stored into it at -; fp - 400 - 256 = fp - 656, which is on-entry sp - 48 + 32 - 656 = -; on-entry sp - 672. We check this offset in the table later on. +; fp + 16, which is on-entry sp - 16. +; We check this offset in the table later on. ; CHECK-LABEL: "?func@@YAHXZ": -; CHECK: stp x29, x30, [sp, #-48]! +; CHECK: stp x29, x30, [sp, #-64]! ; CHECK: str x28, [sp, #16] ; CHECK: str x21, [sp, #24] ; CHECK: stp x19, x20, [sp, #32] @@ -23,7 +23,7 @@ ; CHECK: sub sp, sp, #624 ; CHECK: mov x19, sp ; CHECK: mov x0, #-2 -; CHECK: stur x0, [x19] +; CHECK: stur x0, [x29, #48] ; Now check that x is stored at fp - 20. We check that this is the same ; location accessed from the funclet to retrieve x. @@ -72,7 +72,7 @@ ; Now check that the offset of the unwind help object from the stack pointer on ; entry to func is encoded in cppxdata that is passed to __CxxFrameHandler3. As -; computed above, this comes to -672. +; computed above, this comes to -16. ; CHECK-LABEL: "$cppxdata$?func@@YAHXZ": ; CHECK-NEXT: .word 429065506 ; MagicNumber ; CHECK-NEXT: .word 2 ; MaxState @@ -81,7 +81,7 @@ ; CHECK-NEXT: .word ("$tryMap$?func@@YAHXZ")@IMGREL ; TryBlockMap ; CHECK-NEXT: .word 4 ; IPMapEntries ; CHECK-NEXT: .word ("$ip2state$?func@@YAHXZ")@IMGREL ; IPToStateXData -; CHECK-NEXT: .word -672 ; UnwindHelp +; CHECK-NEXT: .word -16 ; UnwindHelp ; UNWIND: Function: ?func@@YAHXZ (0x0) ; UNWIND: Prologue [ @@ -91,7 +91,7 @@ ; UNWIND-NEXT: ; stp x19, x20, [sp, #32] ; UNWIND-NEXT: ; str x21, [sp, #24] ; UNWIND-NEXT: ; str x28, [sp, #16] -; UNWIND-NEXT: ; stp x29, x30, [sp, #-48]! +; UNWIND-NEXT: ; stp x29, x30, [sp, #-64]! 
; UNWIND-NEXT: ; end ; UNWIND: Function: ?catch$2@?0??func@@YAHXZ@4HA ; UNWIND: Prologue [ diff --git a/llvm/test/CodeGen/AArch64/wineh-unwindhelp-via-fp.ll b/llvm/test/CodeGen/AArch64/wineh-unwindhelp-via-fp.ll new file mode 100644 index 000000000000..6ec78087020c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wineh-unwindhelp-via-fp.ll @@ -0,0 +1,69 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows | FileCheck %s +; Check that we allocate the unwind help stack object in a fixed location from fp +; so that the runtime can find it when handling an exception +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-pc-windows-msvc19.25.28611" + +; Check that the store to the unwind help object for func2 is via FP +; CHECK-LABEL: ?func2@@YAXXZ +; CHECK: mov x[[#SCRATCH_REG:]], #-2 +; CHECK: stur x[[#SCRATCH_REG:]], [x29, #[[#]]] +; +; // struct that requires greater than stack alignment +; struct alignas(32) A +; { +; // data that would be invalid for unwind help (> 0) +; int _x[4]{42, 42, 42, 42}; +; ~A() {} +; }; +; +; // cause us to run the funclet in func2 +; void func3() +; { +; throw 1; +; } +; +; // the funclet that ensures we have the unwind help correct +; void func2() +; { +; A a; +; func3(); +; } +; +; // function to ensure we are misaligned in func2 +; void func1() +; { +; func2(); +; } +; +; // set things up and ensure alignment for func1 +; void test() +; { +; try { +; A a; +; func1(); +; } catch(...) {} +; } + +%struct.A = type { [4 x i32], [16 x i8] } +declare dso_local %struct.A* @"??0A@@QEAA@XZ"(%struct.A* returned %0) +declare dso_local void @"??1A@@QEAA@XZ"(%struct.A* %0) +declare dso_local i32 @__CxxFrameHandler3(...) +declare dso_local void @"?func3@@YAXXZ"() + +; Function Attrs: noinline optnone uwtable +define dso_local void @"?func2@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { + %1 = alloca %struct.A, align 32 + %2 = call %struct.A* @"??0A@@QEAA@XZ"(%struct.A* %1) #3 + invoke void @"?func3@@YAXXZ"() + to label %3 unwind label %4 + +3: ; preds = %0 + call void @"??1A@@QEAA@XZ"(%struct.A* %1) #3 + ret void + +4: ; preds = %0 + %5 = cleanuppad within none [] + call void @"??1A@@QEAA@XZ"(%struct.A* %1) #3 [ "funclet"(token %5) ] + cleanupret from %5 unwind to caller +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir index 2e96ee4e881f..441eec00d5e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir @@ -396,10 +396,10 @@ body: | ; GFX7: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX7: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX7: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT %6(s16) - ; GFX7: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]] ; GFX7: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC2]] + ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX7: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fmad_s16_denorm ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -457,10 +457,10 @@ body: | ; GFX7: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX7: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX7: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT %6(s16) - ; GFX7: $vgpr0 = COPY 
[[ANYEXT]](s32) ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC]], [[TRUNC1]] ; GFX7: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC2]] + ; GFX7: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX7: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fmad_s16_denorm_flags ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -550,16 +550,16 @@ body: | ; GFX7: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX7: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX7: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %10(s16) - ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT %11(s16) + ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC2]] + ; GFX7: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC4]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC3]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC5]] + ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; GFX7: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX7: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX7: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX7: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC3]] - ; GFX7: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC5]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC2]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC4]] ; GFX10-LABEL: name: test_fmad_v2s16_denorm ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 @@ -659,16 +659,16 @@ body: | ; GFX7: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX7: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX7: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %10(s16) - ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT %11(s16) + ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC]], [[TRUNC2]] + ; GFX7: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC4]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC1]], [[TRUNC3]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL1]], [[TRUNC5]] + ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; GFX7: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX7: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX7: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX7: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC1]], [[TRUNC3]] - ; GFX7: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC5]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC]], [[TRUNC2]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL1]], [[TRUNC4]] ; GFX10-LABEL: name: test_fmad_v2s16_denorm_flags ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 @@ -820,26 +820,26 @@ body: | ; GFX7: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX7: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; GFX7: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) - ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %16(s16) - ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT %17(s16) + ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC4]] + ; GFX7: [[FADD:%[0-9]+]]:_(s16) = 
G_FADD [[FMUL]], [[TRUNC8]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC5]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC9]] + ; GFX7: [[FMUL2:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC2]], [[TRUNC6]] + ; GFX7: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[FMUL2]], [[TRUNC10]] + ; GFX7: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[TRUNC7]] + ; GFX7: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[FMUL3]], [[TRUNC11]] + ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; GFX7: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX7: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX7: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX7: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT %18(s16) - ; GFX7: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT %19(s16) + ; GFX7: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FADD2]](s16) + ; GFX7: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[FADD3]](s16) ; GFX7: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX7: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX7: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX7: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; GFX7: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[TRUNC7]] - ; GFX7: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC11]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC2]], [[TRUNC6]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC10]] - ; GFX7: [[FMUL2:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC5]] - ; GFX7: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[FMUL2]], [[TRUNC9]] - ; GFX7: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC4]] - ; GFX7: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[FMUL3]], [[TRUNC8]] ; GFX10-LABEL: name: test_fmad_v4s16_denorm ; GFX10: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -1015,26 +1015,26 @@ body: | ; GFX7: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX7: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; GFX7: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) - ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %16(s16) - ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT %17(s16) + ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC]], [[TRUNC4]] + ; GFX7: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC8]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC1]], [[TRUNC5]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL1]], [[TRUNC9]] + ; GFX7: [[FMUL2:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC2]], [[TRUNC6]] + ; GFX7: [[FADD2:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL2]], [[TRUNC10]] + ; GFX7: [[FMUL3:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC3]], [[TRUNC7]] + ; GFX7: [[FADD3:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL3]], [[TRUNC11]] + ; GFX7: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; GFX7: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; GFX7: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX7: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX7: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX7: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT %18(s16) - ; GFX7: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT %19(s16) + ; GFX7: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FADD2]](s16) + ; GFX7: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[FADD3]](s16) ; GFX7: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX7: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX7: 
[[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX7: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; GFX7: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC3]], [[TRUNC7]] - ; GFX7: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC11]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC2]], [[TRUNC6]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL1]], [[TRUNC10]] - ; GFX7: [[FMUL2:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC1]], [[TRUNC5]] - ; GFX7: [[FADD2:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL2]], [[TRUNC9]] - ; GFX7: [[FMUL3:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC]], [[TRUNC4]] - ; GFX7: [[FADD3:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL3]], [[TRUNC8]] ; GFX10-LABEL: name: test_fmad_v4s16_denorm_flags ; GFX10: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir index 32738d499bda..95515f3593ac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir @@ -252,23 +252,23 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: $vgpr0 = COPY %3(s32) ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] + ; GFX6: $vgpr0 = COPY [[FADD]](s32) ; GFX7-LABEL: name: test_fmad_s32_denorm ; GFX7: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX7: $vgpr0 = COPY %3(s32) ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] + ; GFX7: $vgpr0 = COPY [[FADD]](s32) ; GFX10-LABEL: name: test_fmad_s32_denorm ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: $vgpr0 = COPY %3(s32) ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] + ; GFX10: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -291,23 +291,23 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX6: $vgpr0 = COPY %3(s32) ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] ; GFX6: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] + ; GFX6: $vgpr0 = COPY [[FADD]](s32) ; GFX7-LABEL: name: test_fmad_s32_flags_denorm ; GFX7: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX7: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX7: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX7: $vgpr0 = COPY %3(s32) ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] ; GFX7: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] + ; GFX7: $vgpr0 = COPY [[FADD]](s32) ; GFX10-LABEL: name: test_fmad_s32_flags_denorm ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: $vgpr0 = COPY %3(s32) ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] ; GFX10: [[FADD:%[0-9]+]]:_(s32) = nnan 
G_FADD [[FMUL]], [[COPY2]] + ; GFX10: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -333,12 +333,12 @@ body: | ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX6: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32) + ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] + ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] + ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] + ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] + ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX6: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] - ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV5]] - ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] - ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV4]] ; GFX7-LABEL: name: test_fmad_v2s32_denorm ; GFX7: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -346,12 +346,12 @@ body: | ; GFX7: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX7: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX7: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32) + ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] + ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX7: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] - ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV5]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV4]] ; GFX10-LABEL: name: test_fmad_v2s32_denorm ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -359,12 +359,12 @@ body: | ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX10: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX10: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] + ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] + ; GFX10: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] + ; GFX10: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX10: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] - ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV5]] - ; GFX10: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] - ; GFX10: [[FADD1:%[0-9]+]]:_(s32) 
= G_FADD [[FMUL1]], [[UV4]] %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -390,14 +390,14 @@ body: | ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX6: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR %13(s32), %14(s32), %15(s32) - ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] - ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] + ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] - ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] - ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV6]] + ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] + ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] + ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) + ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX7-LABEL: name: test_fmad_v3s32_denorm ; GFX7: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX7: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 @@ -405,14 +405,14 @@ body: | ; GFX7: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX7: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) ; GFX7: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR %13(s32), %14(s32), %15(s32) - ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] - ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] + ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] ; GFX7: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] ; GFX7: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] - ; GFX7: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] - ; GFX7: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV6]] + ; GFX7: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] + ; GFX7: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) + ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX10-LABEL: name: test_fmad_v3s32_denorm ; GFX10: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 @@ -420,14 +420,14 @@ body: | ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) ; GFX10: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) ; GFX10: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; GFX10: 
[[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR %13(s32), %14(s32), %15(s32) - ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] - ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] + ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] ; GFX10: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] ; GFX10: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] - ; GFX10: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] - ; GFX10: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV6]] + ; GFX10: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] + ; GFX10: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 @@ -453,16 +453,16 @@ body: | ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX6: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) ; GFX6: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %16(s32), %17(s32), %18(s32), %19(s32) + ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] + ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] + ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] + ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] + ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] + ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] + ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] + ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) - ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] - ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV11]] - ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] - ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV10]] - ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] - ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV9]] - ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] - ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV8]] ; GFX7-LABEL: name: test_fmad_v4s32_denorm ; GFX7: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX7: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -470,16 +470,16 @@ body: | ; GFX7: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX7: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) ; GFX7: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %16(s32), %17(s32), %18(s32), 
%19(s32) + ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] + ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX7: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] + ; GFX7: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] + ; GFX7: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] + ; GFX7: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] + ; GFX7: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] + ; GFX7: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] + ; GFX7: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) - ; GFX7: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] - ; GFX7: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV11]] - ; GFX7: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] - ; GFX7: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV10]] - ; GFX7: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] - ; GFX7: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV9]] - ; GFX7: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] - ; GFX7: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV8]] ; GFX10-LABEL: name: test_fmad_v4s32_denorm ; GFX10: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -487,16 +487,16 @@ body: | ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX10: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) ; GFX10: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %16(s32), %17(s32), %18(s32), %19(s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] + ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX10: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] + ; GFX10: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] + ; GFX10: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] + ; GFX10: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] + ; GFX10: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] + ; GFX10: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) - ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] - ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV11]] - ; GFX10: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] - ; GFX10: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV10]] - ; GFX10: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] - ; GFX10: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV9]] - ; GFX10: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] - ; GFX10: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV8]] %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 %2:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir index 6230a3728273..d9204693d518 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -301,14 +301,18 @@ body: | ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32) ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32) + ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32) + ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX6: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -443,10 +447,6 @@ body: | ; GFX6: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] ; GFX6: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]] ; GFX6: $vgpr0_vgpr1 = COPY [[SELECT3]](s64) - ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] - ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] - ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] - ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]] ; GFX8-LABEL: name: test_udiv_s64 ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 @@ -454,14 +454,18 @@ body: | ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32) ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32) + ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32) + ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX8: 
[[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX8: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -596,10 +600,6 @@ body: | ; GFX8: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] ; GFX8: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]] ; GFX8: $vgpr0_vgpr1 = COPY [[SELECT3]](s64) - ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] - ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] - ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] - ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]] ; GFX9-LABEL: name: test_udiv_s64 ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 @@ -607,14 +607,18 @@ body: | ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32) ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32) + ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -749,10 +753,6 @@ body: | ; GFX9: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]] ; GFX9: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]] ; GFX9: $vgpr0_vgpr1 = COPY [[SELECT3]](s64) - ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] - ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] - ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] - ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]] %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_UDIV %0, %1 @@ -774,14 +774,18 @@ body: | ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32) ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %250(s32) + ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX6: 
[[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %258(s32) + ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX6: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -919,14 +923,18 @@ body: | ; GFX6: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV22]](s32) ; GFX6: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV23]](s32) ; GFX6: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32) + ; GFX6: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]] + ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]] + ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32) ; GFX6: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] + ; GFX6: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] ; GFX6: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C9]] - ; GFX6: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]] + ; GFX6: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C9]] + ; GFX6: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]] ; GFX6: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX6: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32) + ; GFX6: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]] + ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX6: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32) ; GFX6: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32) ; GFX6: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) ; GFX6: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) @@ -1059,14 +1067,6 @@ body: | ; GFX6: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]] ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64) ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX6: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]] - ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]] - ; GFX6: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]] - ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]] - ; GFX6: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] - ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]] - ; GFX6: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], 
[[C]] - ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]] ; GFX8-LABEL: name: test_udiv_v2s64 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -1076,14 +1076,18 @@ body: | ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32) ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %250(s32) + ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %258(s32) + ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX8: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX8: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -1221,14 +1225,18 @@ body: | ; GFX8: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV22]](s32) ; GFX8: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV23]](s32) ; GFX8: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32) + ; GFX8: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]] + ; GFX8: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]] + ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32) ; GFX8: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] + ; GFX8: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] ; GFX8: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C9]] - ; GFX8: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]] + ; GFX8: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C9]] + ; GFX8: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]] ; GFX8: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX8: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32) + ; GFX8: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]] + ; GFX8: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX8: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32) ; GFX8: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32) ; GFX8: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) ; GFX8: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) @@ -1361,14 +1369,6 @@ body: | ; GFX8: [[SELECT7:%[0-9]+]]:_(s64) 
= G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]] ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64) ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX8: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]] - ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]] - ; GFX8: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]] - ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]] - ; GFX8: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] - ; GFX8: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]] - ; GFX8: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] - ; GFX8: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]] ; GFX9-LABEL: name: test_udiv_v2s64 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -1378,14 +1378,18 @@ body: | ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32) ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %250(s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]] + ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]] + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]] - ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]] + ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]] + ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]] ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000 - ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %258(s32) + ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]] + ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32) ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32) ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; GFX9: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64) @@ -1523,14 +1527,18 @@ body: | ; GFX9: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV22]](s32) ; GFX9: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV23]](s32) ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000 - ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32) + ; GFX9: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]] + ; GFX9: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]] + ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32) ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000 - ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] + ; GFX9: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C8]] ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000 - ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C9]] - ; GFX9: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]] + ; GFX9: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C9]] + ; GFX9: 
[[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]]
 ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX9: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]]
+ ; GFX9: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32)
 ; GFX9: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32)
 ; GFX9: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
 ; GFX9: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
@@ -1663,14 +1671,6 @@ body: |
 ; GFX9: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]]
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64)
 ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX9: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C10]]
- ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]]
- ; GFX9: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C7]]
- ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]]
- ; GFX9: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX9: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]]
- ; GFX9: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX9: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]]
 %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
 %2:_(<2 x s64>) = G_UDIV %0, %1
@@ -2300,14 +2300,18 @@ body: |
 ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX6: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2443,10 +2447,6 @@ body: |
 ; GFX6: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX8-LABEL: name: test_udiv_s33
 ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -2459,14 +2459,18 @@ body: |
 ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX8: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2602,10 +2606,6 @@ body: |
 ; GFX8: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX9-LABEL: name: test_udiv_s33
 ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -2618,14 +2618,18 @@ body: |
 ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2761,10 +2765,6 @@ body: |
 ; GFX9: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 %0:_(s64) = COPY $vgpr0_vgpr1
 %1:_(s64) = COPY $vgpr2_vgpr3
 %2:_(s33) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
index dfc52a8a7ce5..e42fe1400477 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
@@ -295,14 +295,18 @@ body: |
 ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32)
+ ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32)
+ ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX6: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -430,10 +434,6 @@ body: |
 ; GFX6: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
 ; GFX6: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX6: $vgpr0_vgpr1 = COPY [[SELECT3]](s64)
- ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX8-LABEL: name: test_urem_s64
 ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -441,14 +441,18 @@ body: |
 ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32)
+ ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32)
+ ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX8: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -576,10 +580,6 @@ body: |
 ; GFX8: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
 ; GFX8: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX8: $vgpr0_vgpr1 = COPY [[SELECT3]](s64)
- ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX9-LABEL: name: test_urem_s64
 ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -587,14 +587,18 @@ body: |
 ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %8(s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %16(s32)
+ ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -722,10 +726,6 @@ body: |
 ; GFX9: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
 ; GFX9: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX9: $vgpr0_vgpr1 = COPY [[SELECT3]](s64)
- ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 %0:_(s64) = COPY $vgpr0_vgpr1
 %1:_(s64) = COPY $vgpr2_vgpr3
 %2:_(s64) = G_UREM %0, %1
@@ -747,14 +747,18 @@ body: |
 ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32)
 ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32)
 ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %234(s32)
+ ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %242(s32)
+ ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX6: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -885,14 +889,18 @@ body: |
 ; GFX6: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV18]](s32)
 ; GFX6: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV19]](s32)
 ; GFX6: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX6: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
+ ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]]
+ ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32)
 ; GFX6: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
+ ; GFX6: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
 ; GFX6: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C8]]
- ; GFX6: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]]
+ ; GFX6: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C8]]
+ ; GFX6: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]]
 ; GFX6: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX6: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX6: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
+ ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX6: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32)
 ; GFX6: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32)
 ; GFX6: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
 ; GFX6: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
@@ -1019,14 +1027,6 @@ body: |
 ; GFX6: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]]
 ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64)
 ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX6: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
- ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]]
- ; GFX6: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
- ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]]
- ; GFX6: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX6: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]]
- ; GFX6: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX6: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]]
 ; GFX8-LABEL: name: test_urem_v2s64
 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
@@ -1036,14 +1036,18 @@ body: |
 ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32)
 ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32)
 ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %234(s32)
+ ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %242(s32)
+ ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX8: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX8: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -1174,14 +1178,18 @@ body: |
 ; GFX8: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV18]](s32)
 ; GFX8: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV19]](s32)
 ; GFX8: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX8: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
+ ; GFX8: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]]
+ ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32)
 ; GFX8: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
+ ; GFX8: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
 ; GFX8: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C8]]
- ; GFX8: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]]
+ ; GFX8: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C8]]
+ ; GFX8: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]]
 ; GFX8: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX8: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX8: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
+ ; GFX8: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX8: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32)
 ; GFX8: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32)
 ; GFX8: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
 ; GFX8: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
@@ -1308,14 +1316,6 @@ body: |
 ; GFX8: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]]
 ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64)
 ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX8: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
- ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]]
- ; GFX8: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
- ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]]
- ; GFX8: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX8: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]]
- ; GFX8: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX8: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]]
 ; GFX9-LABEL: name: test_urem_v2s64
 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
@@ -1325,14 +1325,18 @@ body: |
 ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV4]](s32)
 ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV5]](s32)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %234(s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
+ ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+ ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C2]]
- ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C2]]
+ ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %242(s32)
+ ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
+ ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX9: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
@@ -1463,14 +1467,18 @@ body: |
 ; GFX9: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[UV18]](s32)
 ; GFX9: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[UV19]](s32)
 ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX9: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
+ ; GFX9: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[UITOFP2]]
+ ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD2]](s32)
 ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
+ ; GFX9: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C7]]
 ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[C8]]
- ; GFX9: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL3]]
+ ; GFX9: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FMUL5]], [[C8]]
+ ; GFX9: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL6]]
 ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX9: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
+ ; GFX9: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD3]](s32)
 ; GFX9: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC1]](s32)
 ; GFX9: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C4]](s64)
 ; GFX9: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
@@ -1597,14 +1605,6 @@ body: |
 ; GFX9: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[ICMP15]](s1), [[SELECT6]], [[MV3]]
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT3]](s64), [[SELECT7]](s64)
 ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX9: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC1]], [[C9]]
- ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL4]], [[FMUL2]]
- ; GFX9: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP3]], [[C6]]
- ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL5]], [[UITOFP2]]
- ; GFX9: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C3]]
- ; GFX9: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FMUL]]
- ; GFX9: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C]]
- ; GFX9: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[UITOFP]]
 %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
 %2:_(<2 x s64>) = G_UREM %0, %1
@@ -2222,14 +2222,18 @@ body: |
 ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX6: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX6: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2358,10 +2362,6 @@ body: |
 ; GFX6: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX6: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX6: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX6: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX6: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX8-LABEL: name: test_urem_s33
 ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -2374,14 +2374,18 @@ body: |
 ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX8: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX8: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2510,10 +2514,6 @@ body: |
 ; GFX8: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX8: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX8: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX8: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX8: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 ; GFX9-LABEL: name: test_urem_s33
 ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -2526,14 +2526,18 @@ body: |
 ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
 ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
 ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
- ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG %14(s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
+ ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UITOFP]]
+ ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FADD]](s32)
 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x43EFFFFF80000000
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+ ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
 ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3DF0000000000000
- ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[C3]]
- ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL1]]
+ ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[C3]]
+ ; GFX9: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FMUL2]]
 ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC1F0000000000000
- ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI %22(s32)
+ ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
+ ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FMUL1]]
+ ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s32)
 ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[INTRINSIC_TRUNC]](s32)
 ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
 ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C5]](s64)
@@ -2662,10 +2666,6 @@ body: |
 ; GFX9: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP7]](s1), [[SELECT2]], [[MV]]
 ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SELECT3]](s64)
 ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
- ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INTRINSIC_TRUNC]], [[C4]]
- ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
- ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UITOFP1]], [[C1]]
- ; GFX9: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UITOFP]]
 %0:_(s64) = COPY $vgpr0_vgpr1
 %1:_(s64) = COPY $vgpr2_vgpr3
 %2:_(s33) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
index f6d16e87dd02..d200f7c5c306 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
@@ -6,20 +6,17 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX9-LABEL: sample_l_1d:
 ; GFX9: ; %bb.0: ; %main_body
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v1, s12
 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
@@ -27,19 +24,17 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-LABEL: sample_l_1d:
 ; GFX10: ; %bb.0: ; %main_body
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_mov_b32 s8, s10
-; GFX10-NEXT: s_mov_b32 s10, s12
-; GFX10-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
 ; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: s_mov_b32 s11, s13
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, s12
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll
index 9f638dbb4d7a..516e92e08b16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll
@@ -1,404 +1,277 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=legalizer -o - %s | FileCheck -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GCN %s
 define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
- ; GCN-LABEL: name: sample_l_1d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[COPY12]](s32), 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_l_1d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: sample_l_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_l_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) {
- ; GCN-LABEL: name: sample_c_l_1d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_c_l_1d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float -2.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: sample_c_l_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_c_l_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %lod) {
- ; GCN-LABEL: name: sample_l_o_1d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.1d), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_l_o_1d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: sample_l_o_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.o.2d), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_l_o_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %lod) {
- ; GCN-LABEL: name: sample_c_l_o_1d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.1d), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_c_l_o_1d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: sample_c_l_o_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.o.2d), 15, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: sample_c_l_o_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: gather4_l_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.2d), 15, [[BUILD_VECTOR2]](<2 x s32>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: gather4_l_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 15, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: gather4_c_l_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
- ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
- ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
- ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
- ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.2d), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
- ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
- ; GCN: $vgpr0 = COPY [[UV]](s32)
- ; GCN: $vgpr1 = COPY [[UV1]](s32)
- ; GCN: $vgpr2 = COPY [[UV2]](s32)
- ; GCN: $vgpr3 = COPY [[UV3]](s32)
- ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+; GCN-LABEL: gather4_c_l_2d:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
 main_body:
 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 ret <4 x float> %v
 }
 define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) {
- ; GCN-LABEL: name: gather4_l_o_2d
- ; GCN: bb.1.main_body:
- ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
- ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
- ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
- ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
- ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
- ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
- 
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.l.o.2d), 15, [[BUILD_VECTOR2]](<3 x s32>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GCN: $vgpr0 = COPY [[UV]](s32) - ; GCN: $vgpr1 = COPY [[UV1]](s32) - ; GCN: $vgpr2 = COPY [[UV2]](s32) - ; GCN: $vgpr3 = COPY [[UV3]](s32) - ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GCN-LABEL: gather4_l_o_2d: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { - ; GCN-LABEL: name: gather4_c_l_o_2d - ; GCN: bb.1.main_body: - ; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 - ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 - ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 - ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 - ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GCN: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.c.l.o.2d), 15, [[BUILD_VECTOR2]](<4 x s32>), $noreg, $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GCN: $vgpr0 = COPY [[UV]](s32) - ; GCN: $vgpr1 = COPY [[UV1]](s32) - ; GCN: $vgpr2 = COPY [[UV2]](s32) - ; GCN: $vgpr3 = COPY [[UV3]](s32) - ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +; GCN-LABEL: gather4_c_l_o_2d: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll index 900521664e0c..db5fb63f4e76 100644 --- a/llvm/test/CodeGen/NVPTX/fast-math.ll +++ b/llvm/test/CodeGen/NVPTX/fast-math.ll @@ -13,7 +13,7 @@ define float @sqrt_div(float %a, float %b) { } ; CHECK-LABEL: sqrt_div_fast( -; CHECK: sqrt.approx.f32 +; CHECK: sqrt.rn.f32 ; CHECK: div.approx.f32 define float @sqrt_div_fast(float %a, float %b) #0 { %t1 = tail call float @llvm.sqrt.f32(float %a) @@ -21,6 +21,15 @@ define float @sqrt_div_fast(float %a, float %b) #0 { ret float %t2 } +; CHECK-LABEL: sqrt_div_fast_ninf( +; CHECK: sqrt.approx.f32 +; CHECK: div.approx.f32 +define float @sqrt_div_fast_ninf(float %a, float %b) #0 { + %t1 = tail call ninf float @llvm.sqrt.f32(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + ; CHECK-LABEL: sqrt_div_ftz( ; CHECK: sqrt.rn.ftz.f32 ; CHECK: div.rn.ftz.f32 @@ -31,7 +40,7 @@ define float @sqrt_div_ftz(float %a, float %b) #1 { } ; CHECK-LABEL: sqrt_div_fast_ftz( -; CHECK: sqrt.approx.ftz.f32 +; CHECK: sqrt.rn.ftz.f32 ; CHECK: div.approx.ftz.f32 define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 { %t1 = tail call float @llvm.sqrt.f32(float %a) @@ -39,12 +48,20 @@ define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 { ret float %t2 } +; CHECK-LABEL: sqrt_div_fast_ftz_ninf( +; CHECK: sqrt.approx.ftz.f32 +; CHECK: div.approx.ftz.f32 +define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 { + %t1 = tail call ninf float @llvm.sqrt.f32(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + ; There are no fast-math or ftz versions of sqrt and div for f64. 
With ninf we use ; reciprocal(rsqrt(x)) for sqrt(x), and in all cases emit a vanilla divide. ; CHECK-LABEL: sqrt_div_fast_ftz_f64( -; CHECK: rsqrt.approx.f64 -; CHECK: rcp.approx.ftz.f64 +; CHECK: sqrt.rn.f64 ; CHECK: div.rn.f64 define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 { %t1 = tail call double @llvm.sqrt.f64(double %a) @@ -52,6 +69,16 @@ define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 { ret double %t2 } +; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf( +; CHECK: rsqrt.approx.f64 +; CHECK: rcp.approx.ftz.f64 +; CHECK: div.rn.f64 +define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 { + %t1 = tail call ninf double @llvm.sqrt.f64(double %a) + %t2 = fdiv double %t1, %b + ret double %t2 +} + ; CHECK-LABEL: rsqrt( ; CHECK-NOT: rsqrt.approx ; CHECK: sqrt.rn.f32 diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll index a8590b7c43ab..465b696c7610 100644 --- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll +++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll @@ -45,35 +45,63 @@ define double @test_rsqrt64_ftz(double %a) #0 #1 { ; CHECK-LABEL: test_sqrt32 define float @test_sqrt32(float %a) #0 { -; CHECK: sqrt.approx.f32 +; CHECK: sqrt.rn.f32 %ret = tail call float @llvm.sqrt.f32(float %a) ret float %ret } +; CHECK-LABEL: test_sqrt32_ninf +define float @test_sqrt32_ninf(float %a) #0 { +; CHECK: sqrt.approx.f32 + %ret = tail call ninf float @llvm.sqrt.f32(float %a) + ret float %ret +} + ; CHECK-LABEL: test_sqrt_ftz define float @test_sqrt_ftz(float %a) #0 #1 { -; CHECK: sqrt.approx.ftz.f32 +; CHECK: sqrt.rn.ftz.f32 %ret = tail call float @llvm.sqrt.f32(float %a) ret float %ret } +; CHECK-LABEL: test_sqrt_ftz_ninf +define float @test_sqrt_ftz_ninf(float %a) #0 #1 { +; CHECK: sqrt.approx.ftz.f32 + %ret = tail call ninf float @llvm.sqrt.f32(float %a) + ret float %ret +} + ; CHECK-LABEL: test_sqrt64 define double @test_sqrt64(double %a) #0 { +; CHECK: sqrt.rn.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +; CHECK-LABEL: test_sqrt64_ninf +define double @test_sqrt64_ninf(double %a) #0 { ; There's no sqrt.approx.f64 instruction; we emit ; reciprocal(rsqrt.approx.f64(x)). There's no non-ftz approximate reciprocal, ; so we just use the ftz version. ; CHECK: rsqrt.approx.f64 ; CHECK: rcp.approx.ftz.f64 - %ret = tail call double @llvm.sqrt.f64(double %a) + %ret = tail call ninf double @llvm.sqrt.f64(double %a) ret double %ret } ; CHECK-LABEL: test_sqrt64_ftz define double @test_sqrt64_ftz(double %a) #0 #1 { +; CHECK: sqrt.rn.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +; CHECK-LABEL: test_sqrt64_ftz_ninf +define double @test_sqrt64_ftz_ninf(double %a) #0 #1 { ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
; CHECK: rsqrt.approx.f64 ; CHECK: rcp.approx.ftz.f64 - %ret = tail call double @llvm.sqrt.f64(double %a) + %ret = tail call ninf double @llvm.sqrt.f64(double %a) ret double %ret } @@ -92,11 +120,18 @@ define float @test_rsqrt32_refined(float %a) #0 #2 { ; CHECK-LABEL: test_sqrt32_refined define float @test_sqrt32_refined(float %a) #0 #2 { -; CHECK: rsqrt.approx.f32 +; CHECK: sqrt.rn.f32 %ret = tail call float @llvm.sqrt.f32(float %a) ret float %ret } +; CHECK-LABEL: test_sqrt32_refined_ninf +define float @test_sqrt32_refined_ninf(float %a) #0 #2 { +; CHECK: rsqrt.approx.f32 + %ret = tail call ninf float @llvm.sqrt.f32(float %a) + ret float %ret +} + ; CHECK-LABEL: test_rsqrt64_refined define double @test_rsqrt64_refined(double %a) #0 #2 { ; CHECK: rsqrt.approx.f64 @@ -107,11 +142,18 @@ define double @test_rsqrt64_refined(double %a) #0 #2 { ; CHECK-LABEL: test_sqrt64_refined define double @test_sqrt64_refined(double %a) #0 #2 { -; CHECK: rsqrt.approx.f64 +; CHECK: sqrt.rn.f64 %ret = tail call double @llvm.sqrt.f64(double %a) ret double %ret } +; CHECK-LABEL: test_sqrt64_refined_ninf +define double @test_sqrt64_refined_ninf(double %a) #0 #2 { +; CHECK: rsqrt.approx.f64 + %ret = tail call ninf double @llvm.sqrt.f64(double %a) + ret double %ret +} + ; -- refined sqrt and rsqrt with ftz enabled -- ; CHECK-LABEL: test_rsqrt32_refined_ftz @@ -124,11 +166,18 @@ define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 { ; CHECK-LABEL: test_sqrt32_refined_ftz define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 { -; CHECK: rsqrt.approx.ftz.f32 +; CHECK: sqrt.rn.ftz.f32 %ret = tail call float @llvm.sqrt.f32(float %a) ret float %ret } +; CHECK-LABEL: test_sqrt32_refined_ftz_ninf +define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 { +; CHECK: rsqrt.approx.ftz.f32 + %ret = tail call ninf float @llvm.sqrt.f32(float %a) + ret float %ret +} + ; CHECK-LABEL: test_rsqrt64_refined_ftz define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version. 
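The rule these updated checks encode: with "unsafe-fp-math" alone, llvm.sqrt now lowers to the correctly rounded sqrt.rn form, and the approximate instruction is used only when the call itself carries the ninf flag. A minimal standalone sketch of the two cases (hypothetical file, mirroring the tests above; the RUN line is assumed to be the same llc NVPTX invocation this test file already uses):

; Without ninf, "unsafe-fp-math" is no longer enough for the approximation.
define float @sqrt_unsafe_only(float %a) #0 {
; expected: sqrt.rn.f32
  %r = tail call float @llvm.sqrt.f32(float %a)
  ret float %r
}

; The ninf flag on the call opts back in to the approximate instruction.
define float @sqrt_unsafe_ninf(float %a) #0 {
; expected: sqrt.approx.f32
  %r = tail call ninf float @llvm.sqrt.f32(float %a)
  ret float %r
}

declare float @llvm.sqrt.f32(float)

attributes #0 = { "unsafe-fp-math" = "true" }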
@@ -140,11 +189,18 @@ define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { ; CHECK-LABEL: test_sqrt64_refined_ftz define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 { -; CHECK: rsqrt.approx.f64 +; CHECK: sqrt.rn.f64 %ret = tail call double @llvm.sqrt.f64(double %a) ret double %ret } +; CHECK-LABEL: test_sqrt64_refined_ftz_ninf +define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 { +; CHECK: rsqrt.approx.f64 + %ret = tail call ninf double @llvm.sqrt.f64(double %a) + ret double %ret +} + attributes #0 = { "unsafe-fp-math" = "true" } attributes #1 = { "denormal-fp-math-f32" = "preserve-sign,preserve-sign" } attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" } diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll index a09857169a30..a18a211b46b2 100644 --- a/llvm/test/CodeGen/PowerPC/aix-return55.ll +++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll @@ -31,7 +31,7 @@ entry: ;CHECKOBJ-NEXT: 18: 00 01 23 45 ;CHECKOBJ-NEXT: 1c: 67 8a bc de oris 10, 28, 48350{{[[:space:]] *}} ;CHECKOBJ-NEXT: 00000020 : -;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, .+0 +;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, 0x20 ;CHECKOBJ-NEXT: 24: 00 00 00 00 {{[[:space:]] *}} ;CHECKOBJ-NEXT: 00000028 : ;CHECKOBJ-NEXT: 28: 00 00 00 00 diff --git a/llvm/test/CodeGen/PowerPC/alignlongjumptest.mir b/llvm/test/CodeGen/PowerPC/alignlongjumptest.mir index 2ec09b0fb26b..56ddb2dc033b 100644 --- a/llvm/test/CodeGen/PowerPC/alignlongjumptest.mir +++ b/llvm/test/CodeGen/PowerPC/alignlongjumptest.mir @@ -70,12 +70,12 @@ body: | ... # Check for the long branch. -# CHECK-LE: 08 00 82 4{{[01]}} b{{[tf]}} 2, .+8 +# CHECK-LE: 08 00 82 4{{[01]}} b{{[tf]}} 2, 0xc # CHECK-LE-NEXT: fc 7f 00 48 b .+32764 # CHECK-LE-DAG: paddi 3, 3, 13, 0 # CHECK-LE-DAG: paddi 3, 3, 21, 0 # CHECK-LE: blr -# CHECK-BE: 4{{[01]}} 82 00 08 b{{[tf]}} 2, .+8 +# CHECK-BE: 4{{[01]}} 82 00 08 b{{[tf]}} 2, 0xc # CHECK-BE-NEXT: 48 00 7f fc b .+32764 # CHECK-BE-DAG: paddi 3, 3, 13, 0 # CHECK-BE-DAG: paddi 3, 3, 21, 0 diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll index 222583638d59..59a7d233c0c3 100644 --- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll +++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll @@ -270,11 +270,11 @@ define float @fmul_fma_fast2(float %x) { ; Reduced precision for sqrt is allowed - should use estimate and NR iterations. 
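The PowerPC hunks below apply the same gating: the estimate-plus-Newton-Raphson expansion (xsrsqrtesp followed by xsmulsp/xsmaddasp refinement, guarded by a compare against zero) is now produced only when the call carries ninf in addition to afn, while a bare afn call falls back to the full-precision xssqrtsp. That is also why the DAG dumps now print "fmul ninf afn" where they previously printed "fmul afn". A minimal sketch of the contrast (hypothetical function names; attributes elided, since the _ieee and _preserve_sign variants in this file differ only in their denormal mode, which affects the zero guard around the estimate):

define float @sqrt_estimate(float %x) {
; afn ninf: expect xsrsqrtesp plus refinement instead of a hardware sqrt
  %r = call afn ninf float @llvm.sqrt.f32(float %x)
  ret float %r
}

define float @sqrt_full_precision(float %x) {
; afn alone: expect a single xssqrtsp
  %r = call afn float @llvm.sqrt.f32(float %x)
  ret float %r
}

declare float @llvm.sqrt.f32(float)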
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_ieee:' -; FMFDEBUG: fmul afn {{t[0-9]+}} +; FMFDEBUG: fmul ninf afn {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_ieee:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_ieee:' -; GLOBALDEBUG: fmul afn {{t[0-9]+}} +; GLOBALDEBUG: fmul ninf afn {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_ieee:' define float @sqrt_afn_ieee(float %x) #0 { @@ -321,17 +321,31 @@ define float @sqrt_afn_ieee(float %x) #0 { ; GLOBAL-NEXT: xsmulsp 0, 0, 2 ; GLOBAL-NEXT: .LBB10_2: ; GLOBAL-NEXT: fmr 1, 0 +; GLOBAL-NEXT: blr + %rt = call afn ninf float @llvm.sqrt.f32(float %x) + ret float %rt +} + +define float @sqrt_afn_ieee_inf(float %x) #0 { +; FMF-LABEL: sqrt_afn_ieee_inf: +; FMF: # %bb.0: +; FMF-NEXT: xssqrtsp 1, 1 +; FMF-NEXT: blr +; +; GLOBAL-LABEL: sqrt_afn_ieee_inf: +; GLOBAL: # %bb.0: +; GLOBAL-NEXT: xssqrtsp 1, 1 ; GLOBAL-NEXT: blr %rt = call afn float @llvm.sqrt.f32(float %x) ret float %rt } ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_preserve_sign:' -; FMFDEBUG: fmul afn {{t[0-9]+}} +; FMFDEBUG: fmul ninf afn {{t[0-9]+}} ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_preserve_sign:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_preserve_sign:' -; GLOBALDEBUG: fmul afn {{t[0-9]+}} +; GLOBALDEBUG: fmul ninf afn {{t[0-9]+}} ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_preserve_sign:' define float @sqrt_afn_preserve_sign(float %x) #1 { @@ -339,19 +353,19 @@ define float @sqrt_afn_preserve_sign(float %x) #1 { ; FMF: # %bb.0: ; FMF-NEXT: xxlxor 0, 0, 0 ; FMF-NEXT: fcmpu 0, 1, 0 -; FMF-NEXT: beq 0, .LBB11_2 +; FMF-NEXT: beq 0, .LBB12_2 ; FMF-NEXT: # %bb.1: ; FMF-NEXT: xsrsqrtesp 0, 1 -; FMF-NEXT: addis 3, 2, .LCPI11_0@toc@ha -; FMF-NEXT: addis 4, 2, .LCPI11_1@toc@ha -; FMF-NEXT: lfs 2, .LCPI11_0@toc@l(3) -; FMF-NEXT: lfs 3, .LCPI11_1@toc@l(4) +; FMF-NEXT: addis 3, 2, .LCPI12_0@toc@ha +; FMF-NEXT: addis 4, 2, .LCPI12_1@toc@ha +; FMF-NEXT: lfs 2, .LCPI12_0@toc@l(3) +; FMF-NEXT: lfs 3, .LCPI12_1@toc@l(4) ; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: xsmulsp 0, 1, 0 ; FMF-NEXT: xsmulsp 1, 1, 2 ; FMF-NEXT: xsaddsp 0, 0, 3 ; FMF-NEXT: xsmulsp 0, 1, 0 -; FMF-NEXT: .LBB11_2: +; FMF-NEXT: .LBB12_2: ; FMF-NEXT: fmr 1, 0 ; FMF-NEXT: blr ; @@ -359,19 +373,33 @@ define float @sqrt_afn_preserve_sign(float %x) #1 { ; GLOBAL: # %bb.0: ; GLOBAL-NEXT: xxlxor 0, 0, 0 ; GLOBAL-NEXT: fcmpu 0, 1, 0 -; GLOBAL-NEXT: beq 0, .LBB11_2 +; GLOBAL-NEXT: beq 0, .LBB12_2 ; GLOBAL-NEXT: # %bb.1: ; GLOBAL-NEXT: xsrsqrtesp 0, 1 -; GLOBAL-NEXT: addis 3, 2, .LCPI11_0@toc@ha -; GLOBAL-NEXT: addis 4, 2, .LCPI11_1@toc@ha -; GLOBAL-NEXT: lfs 2, .LCPI11_0@toc@l(3) -; GLOBAL-NEXT: lfs 3, .LCPI11_1@toc@l(4) +; GLOBAL-NEXT: addis 3, 2, .LCPI12_0@toc@ha +; GLOBAL-NEXT: addis 4, 2, .LCPI12_1@toc@ha +; GLOBAL-NEXT: lfs 2, .LCPI12_0@toc@l(3) +; GLOBAL-NEXT: lfs 3, .LCPI12_1@toc@l(4) ; GLOBAL-NEXT: xsmulsp 1, 1, 0 ; GLOBAL-NEXT: xsmaddasp 2, 1, 0 ; GLOBAL-NEXT: xsmulsp 0, 1, 3 ; GLOBAL-NEXT: xsmulsp 0, 0, 2 -; GLOBAL-NEXT: .LBB11_2: +; GLOBAL-NEXT: .LBB12_2: ; GLOBAL-NEXT: fmr 1, 0 +; GLOBAL-NEXT: blr + %rt = call afn ninf float @llvm.sqrt.f32(float %x) + ret float %rt +} + +define float @sqrt_afn_preserve_sign_inf(float %x) #1 { +; FMF-LABEL: sqrt_afn_preserve_sign_inf: +; FMF: # %bb.0: +; FMF-NEXT: xssqrtsp 1, 1 +; FMF-NEXT: blr +; +; GLOBAL-LABEL: sqrt_afn_preserve_sign_inf: +; GLOBAL: # %bb.0: +; GLOBAL-NEXT: xssqrtsp 1, 1 ; GLOBAL-NEXT: blr %rt = 
call afn float @llvm.sqrt.f32(float %x) ret float %rt @@ -390,45 +418,45 @@ define float @sqrt_afn_preserve_sign(float %x) #1 { define float @sqrt_fast_ieee(float %x) #0 { ; FMF-LABEL: sqrt_fast_ieee: ; FMF: # %bb.0: -; FMF-NEXT: addis 3, 2, .LCPI12_2@toc@ha +; FMF-NEXT: addis 3, 2, .LCPI14_2@toc@ha ; FMF-NEXT: fabs 0, 1 -; FMF-NEXT: lfs 2, .LCPI12_2@toc@l(3) +; FMF-NEXT: lfs 2, .LCPI14_2@toc@l(3) ; FMF-NEXT: fcmpu 0, 0, 2 ; FMF-NEXT: xxlxor 0, 0, 0 -; FMF-NEXT: blt 0, .LBB12_2 +; FMF-NEXT: blt 0, .LBB14_2 ; FMF-NEXT: # %bb.1: ; FMF-NEXT: xsrsqrtesp 0, 1 -; FMF-NEXT: addis 3, 2, .LCPI12_0@toc@ha -; FMF-NEXT: addis 4, 2, .LCPI12_1@toc@ha -; FMF-NEXT: lfs 2, .LCPI12_0@toc@l(3) -; FMF-NEXT: lfs 3, .LCPI12_1@toc@l(4) +; FMF-NEXT: addis 3, 2, .LCPI14_0@toc@ha +; FMF-NEXT: addis 4, 2, .LCPI14_1@toc@ha +; FMF-NEXT: lfs 2, .LCPI14_0@toc@l(3) +; FMF-NEXT: lfs 3, .LCPI14_1@toc@l(4) ; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: xsmaddasp 2, 1, 0 ; FMF-NEXT: xsmulsp 0, 1, 3 ; FMF-NEXT: xsmulsp 0, 0, 2 -; FMF-NEXT: .LBB12_2: +; FMF-NEXT: .LBB14_2: ; FMF-NEXT: fmr 1, 0 ; FMF-NEXT: blr ; ; GLOBAL-LABEL: sqrt_fast_ieee: ; GLOBAL: # %bb.0: -; GLOBAL-NEXT: addis 3, 2, .LCPI12_2@toc@ha +; GLOBAL-NEXT: addis 3, 2, .LCPI14_2@toc@ha ; GLOBAL-NEXT: fabs 0, 1 -; GLOBAL-NEXT: lfs 2, .LCPI12_2@toc@l(3) +; GLOBAL-NEXT: lfs 2, .LCPI14_2@toc@l(3) ; GLOBAL-NEXT: fcmpu 0, 0, 2 ; GLOBAL-NEXT: xxlxor 0, 0, 0 -; GLOBAL-NEXT: blt 0, .LBB12_2 +; GLOBAL-NEXT: blt 0, .LBB14_2 ; GLOBAL-NEXT: # %bb.1: ; GLOBAL-NEXT: xsrsqrtesp 0, 1 -; GLOBAL-NEXT: addis 3, 2, .LCPI12_0@toc@ha -; GLOBAL-NEXT: addis 4, 2, .LCPI12_1@toc@ha -; GLOBAL-NEXT: lfs 2, .LCPI12_0@toc@l(3) -; GLOBAL-NEXT: lfs 3, .LCPI12_1@toc@l(4) +; GLOBAL-NEXT: addis 3, 2, .LCPI14_0@toc@ha +; GLOBAL-NEXT: addis 4, 2, .LCPI14_1@toc@ha +; GLOBAL-NEXT: lfs 2, .LCPI14_0@toc@l(3) +; GLOBAL-NEXT: lfs 3, .LCPI14_1@toc@l(4) ; GLOBAL-NEXT: xsmulsp 1, 1, 0 ; GLOBAL-NEXT: xsmaddasp 2, 1, 0 ; GLOBAL-NEXT: xsmulsp 0, 1, 3 ; GLOBAL-NEXT: xsmulsp 0, 0, 2 -; GLOBAL-NEXT: .LBB12_2: +; GLOBAL-NEXT: .LBB14_2: ; GLOBAL-NEXT: fmr 1, 0 ; GLOBAL-NEXT: blr %rt = call fast float @llvm.sqrt.f32(float %x) @@ -450,18 +478,18 @@ define float @sqrt_fast_preserve_sign(float %x) #1 { ; FMF: # %bb.0: ; FMF-NEXT: xxlxor 0, 0, 0 ; FMF-NEXT: fcmpu 0, 1, 0 -; FMF-NEXT: beq 0, .LBB13_2 +; FMF-NEXT: beq 0, .LBB15_2 ; FMF-NEXT: # %bb.1: ; FMF-NEXT: xsrsqrtesp 0, 1 -; FMF-NEXT: addis 3, 2, .LCPI13_0@toc@ha -; FMF-NEXT: addis 4, 2, .LCPI13_1@toc@ha -; FMF-NEXT: lfs 2, .LCPI13_0@toc@l(3) -; FMF-NEXT: lfs 3, .LCPI13_1@toc@l(4) +; FMF-NEXT: addis 3, 2, .LCPI15_0@toc@ha +; FMF-NEXT: addis 4, 2, .LCPI15_1@toc@ha +; FMF-NEXT: lfs 2, .LCPI15_0@toc@l(3) +; FMF-NEXT: lfs 3, .LCPI15_1@toc@l(4) ; FMF-NEXT: xsmulsp 1, 1, 0 ; FMF-NEXT: xsmaddasp 2, 1, 0 ; FMF-NEXT: xsmulsp 0, 1, 3 ; FMF-NEXT: xsmulsp 0, 0, 2 -; FMF-NEXT: .LBB13_2: +; FMF-NEXT: .LBB15_2: ; FMF-NEXT: fmr 1, 0 ; FMF-NEXT: blr ; @@ -469,18 +497,18 @@ define float @sqrt_fast_preserve_sign(float %x) #1 { ; GLOBAL: # %bb.0: ; GLOBAL-NEXT: xxlxor 0, 0, 0 ; GLOBAL-NEXT: fcmpu 0, 1, 0 -; GLOBAL-NEXT: beq 0, .LBB13_2 +; GLOBAL-NEXT: beq 0, .LBB15_2 ; GLOBAL-NEXT: # %bb.1: ; GLOBAL-NEXT: xsrsqrtesp 0, 1 -; GLOBAL-NEXT: addis 3, 2, .LCPI13_0@toc@ha -; GLOBAL-NEXT: addis 4, 2, .LCPI13_1@toc@ha -; GLOBAL-NEXT: lfs 2, .LCPI13_0@toc@l(3) -; GLOBAL-NEXT: lfs 3, .LCPI13_1@toc@l(4) +; GLOBAL-NEXT: addis 3, 2, .LCPI15_0@toc@ha +; GLOBAL-NEXT: addis 4, 2, .LCPI15_1@toc@ha +; GLOBAL-NEXT: lfs 2, .LCPI15_0@toc@l(3) +; GLOBAL-NEXT: lfs 3, .LCPI15_1@toc@l(4) ; GLOBAL-NEXT: xsmulsp 1, 1, 0 ; 
GLOBAL-NEXT: xsmaddasp 2, 1, 0 ; GLOBAL-NEXT: xsmulsp 0, 1, 3 ; GLOBAL-NEXT: xsmulsp 0, 0, 2 -; GLOBAL-NEXT: .LBB13_2: +; GLOBAL-NEXT: .LBB15_2: ; GLOBAL-NEXT: fmr 1, 0 ; GLOBAL-NEXT: blr %rt = call fast float @llvm.sqrt.f32(float %x) @@ -502,10 +530,10 @@ define double @fcmp_nnan(double %a, double %y, double %z) { ; FMF: # %bb.0: ; FMF-NEXT: xxlxor 0, 0, 0 ; FMF-NEXT: xscmpudp 0, 1, 0 -; FMF-NEXT: blt 0, .LBB14_2 +; FMF-NEXT: blt 0, .LBB16_2 ; FMF-NEXT: # %bb.1: ; FMF-NEXT: fmr 3, 2 -; FMF-NEXT: .LBB14_2: +; FMF-NEXT: .LBB16_2: ; FMF-NEXT: fmr 1, 3 ; FMF-NEXT: blr ; @@ -513,10 +541,10 @@ define double @fcmp_nnan(double %a, double %y, double %z) { ; GLOBAL: # %bb.0: ; GLOBAL-NEXT: xxlxor 0, 0, 0 ; GLOBAL-NEXT: xscmpudp 0, 1, 0 -; GLOBAL-NEXT: blt 0, .LBB14_2 +; GLOBAL-NEXT: blt 0, .LBB16_2 ; GLOBAL-NEXT: # %bb.1: ; GLOBAL-NEXT: fmr 3, 2 -; GLOBAL-NEXT: .LBB14_2: +; GLOBAL-NEXT: .LBB16_2: ; GLOBAL-NEXT: fmr 1, 3 ; GLOBAL-NEXT: blr %cmp = fcmp nnan ult double %a, 0.0 diff --git a/llvm/test/CodeGen/PowerPC/mi-simplify-code.mir b/llvm/test/CodeGen/PowerPC/mi-simplify-code.mir new file mode 100644 index 000000000000..15c1c4e1ef7e --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/mi-simplify-code.mir @@ -0,0 +1,63 @@ +# RUN: llc -mtriple powerpc64le-unknown-linux-gnu -mcpu=pwr8 -x mir < %s \ +# RUN: -verify-machineinstrs -start-before=ppc-mi-peepholes | FileCheck %s + +--- +name: remove_frsp +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4 + + %1:g8rc = COPY $x4 + %0:g8rc_and_g8rc_nox0 = COPY $x3 + %2:g8rc = RLDICR %1, 2, 61 + %3:f8rc, %4:g8rc_and_g8rc_nox0 = LFSUX %0, killed %2 + %5:f4rc = FRSP killed %3, implicit $rm + %22:vslrc = SUBREG_TO_REG 1, %5, %subreg.sub_64 + %7:g8rc = LI8 8 + %8:vssrc = XFLOADf32 %4, killed %7 + %23:vslrc = SUBREG_TO_REG 1, %8, %subreg.sub_64 + %10:vsrc = XXPERMDI %23, %22, 0 + %11:vrrc = XVCVDPSP killed %10, implicit $rm + $v2 = COPY %11 + BLR8 implicit $lr8, implicit $rm, implicit $v2 +... +# CHECK-LABEL: remove_frsp +# CHECK: sldi 4, 4, 2 +# CHECK-NEXT: lfsux 0, 3, 4 +# CHECK-NOT: frsp +# CHECK-NEXT: lfs 1, 8(3) +# CHECK-NEXT: xxmrghd 0, 1, 0 +# CHECK-NEXT: xvcvdpsp 34, 0 +# CHECK-NEXT: blr + +--- +name: remove_xsrsp +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4 + + %1:g8rc = COPY $x4 + %0:g8rc_and_g8rc_nox0 = COPY $x3 + %2:g8rc = RLDICR %1, 2, 61 + %3:f8rc, %4:g8rc_and_g8rc_nox0 = LFSUX %0, killed %2 + %5:vssrc = XSRSP killed %3 + %22:vslrc = SUBREG_TO_REG 1, %5, %subreg.sub_64 + %7:g8rc = LI8 8 + %8:vssrc = XFLOADf32 %4, killed %7 + %23:vslrc = SUBREG_TO_REG 1, %8, %subreg.sub_64 + %10:vsrc = XXPERMDI %23, %22, 0 + %11:vrrc = XVCVDPSP killed %10, implicit $rm + $v2 = COPY %11 + BLR8 implicit $lr8, implicit $rm, implicit $v2 +... 
+# CHECK-LABEL: remove_xsrsp +# CHECK: sldi 4, 4, 2 +# CHECK-NEXT: lfsux 0, 3, 4 +# CHECK-NEXT: xsrsp 0, 0 +# CHECK-NEXT: lfs 1, 8(3) +# CHECK-NEXT: xxmrghd 0, 1, 0 +# CHECK-NEXT: xvcvdpsp 34, 0 +# CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/pr45297.ll b/llvm/test/CodeGen/PowerPC/pr45297.ll index 5bd5df543950..39583d5a04cc 100644 --- a/llvm/test/CodeGen/PowerPC/pr45297.ll +++ b/llvm/test/CodeGen/PowerPC/pr45297.ll @@ -1,11 +1,20 @@ -; RUN: not --crash llc -verify-machineinstrs \ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec \ ; RUN: -mattr=-power8-vector -mattr=-vsx < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: Cannot select: {{.*}}: ch = PPCISD::ST_VSR_SCAL_INT<(store 4 into @Global)> @Global = dso_local global i32 55, align 4 define dso_local void @test(float %0) local_unnamed_addr { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fctiwz f0, f1 +; CHECK-NEXT: addi r3, r1, -4 +; CHECK-NEXT: addis r4, r2, Global@toc@ha +; CHECK-NEXT: stfiwx f0, 0, r3 +; CHECK-NEXT: lwz r3, -4(r1) +; CHECK-NEXT: stw r3, Global@toc@l(r4) +; CHECK-NEXT: blr entry: %1 = fptosi float %0 to i32 store i32 %1, i32* @Global, align 4 diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll index 43da05ebe7c7..f2691ba1a771 100644 --- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll @@ -1628,6 +1628,7 @@ define void @cmpxchg_i32_monotonic_monotonic(i32* %ptr, i32 %cmp, i32 %val) noun ; ; RV64IA-LABEL: cmpxchg_i32_monotonic_monotonic: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB20_3 @@ -1680,6 +1681,7 @@ define void @cmpxchg_i32_acquire_monotonic(i32* %ptr, i32 %cmp, i32 %val) nounwi ; ; RV64IA-LABEL: cmpxchg_i32_acquire_monotonic: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aq a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB21_3 @@ -1732,6 +1734,7 @@ define void @cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %cmp, i32 %val) nounwind ; ; RV64IA-LABEL: cmpxchg_i32_acquire_acquire: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aq a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB22_3 @@ -1784,6 +1787,7 @@ define void @cmpxchg_i32_release_monotonic(i32* %ptr, i32 %cmp, i32 %val) nounwi ; ; RV64IA-LABEL: cmpxchg_i32_release_monotonic: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB23_3 @@ -1836,6 +1840,7 @@ define void @cmpxchg_i32_release_acquire(i32* %ptr, i32 %cmp, i32 %val) nounwind ; ; RV64IA-LABEL: cmpxchg_i32_release_acquire: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB24_3 @@ -1888,6 +1893,7 @@ define void @cmpxchg_i32_acq_rel_monotonic(i32* %ptr, i32 %cmp, i32 %val) nounwi ; ; RV64IA-LABEL: cmpxchg_i32_acq_rel_monotonic: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aq a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB25_3 @@ -1940,6 +1946,7 @@ define void 
@cmpxchg_i32_acq_rel_acquire(i32* %ptr, i32 %cmp, i32 %val) nounwind ; ; RV64IA-LABEL: cmpxchg_i32_acq_rel_acquire: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aq a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB26_3 @@ -1992,6 +1999,7 @@ define void @cmpxchg_i32_seq_cst_monotonic(i32* %ptr, i32 %cmp, i32 %val) nounwi ; ; RV64IA-LABEL: cmpxchg_i32_seq_cst_monotonic: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB27_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aqrl a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB27_3 @@ -2044,6 +2052,7 @@ define void @cmpxchg_i32_seq_cst_acquire(i32* %ptr, i32 %cmp, i32 %val) nounwind ; ; RV64IA-LABEL: cmpxchg_i32_seq_cst_acquire: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aqrl a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB28_3 @@ -2096,6 +2105,7 @@ define void @cmpxchg_i32_seq_cst_seq_cst(i32* %ptr, i32 %cmp, i32 %val) nounwind ; ; RV64IA-LABEL: cmpxchg_i32_seq_cst_seq_cst: ; RV64IA: # %bb.0: +; RV64IA-NEXT: sext.w a1, a1 ; RV64IA-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w.aqrl a3, (a0) ; RV64IA-NEXT: bne a3, a1, .LBB29_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll index bc94f8ba9187..70fc0e4ab1cb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -383,11 +383,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vpnot_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: vpnot_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vpt.s32 lt, q0, zr +; CHECK-NEXT: vpte.s32 lt, q0, zr ; CHECK-NEXT: vcmpt.s32 gt, q1, zr -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.i32 eq, q2, zr +; CHECK-NEXT: vcmpe.i32 eq, q2, zr ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -400,3 +398,73 @@ entry: %s = select <4 x i1> %o, <4 x i32> %a, <4 x i32> %b ret <4 x i32> %s } + +declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) + +define arm_aapcs_vfpcc <4 x i32> @vpttet_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: vpttet_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpttet.s32 ge, q0, q2 +; CHECK-NEXT: vmaxt.s32 q3, q0, q1 +; CHECK-NEXT: vcmpt.s32 gt, q0, zr +; CHECK-NEXT: vcmpe.s32 gt, q1, zr +; CHECK-NEXT: vmovt q3, q2 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: bx lr +entry: + %0 = icmp sge <4 x i32> %x, %z + %1 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %x, <4 x i32> %y, i32 0, <4 x i1> %0, <4 x i32> %z) + %2 = icmp sgt <4 x i32> %x, zeroinitializer + %3 = and <4 x i1> %0, %2 + %4 = xor <4 x i1> %3, + %5 = icmp sgt <4 x i32> %y, zeroinitializer + %6 = and <4 x i1> %5, %4 + %7 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %6, <4 x i32> %1) + ret <4 x i32> %7 +} + +define arm_aapcs_vfpcc <4 x i32> @vpttee_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: vpttee_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpttee.s32 ge, q0, q2 +; CHECK-NEXT: vmaxt.s32 q3, q0, q1 +; CHECK-NEXT: vcmpt.s32 gt, q0, zr +; CHECK-NEXT: vmove q3, q2 +; CHECK-NEXT: vmove q3, q2 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: bx lr +entry: + 
%0 = icmp sge <4 x i32> %x, %z + %1 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %x, <4 x i32> %y, i32 0, <4 x i1> %0, <4 x i32> %z) + %2 = icmp sgt <4 x i32> %x, zeroinitializer + %3 = and <4 x i1> %0, %2 + %4 = xor <4 x i1> %3, + %5 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %1) + %6 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %5) + ret <4 x i32> %6 +} + +define arm_aapcs_vfpcc <4 x i32> @vpttte_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: vpttte_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpttte.s32 ge, q0, q2 +; CHECK-NEXT: vmaxt.s32 q3, q0, q1 +; CHECK-NEXT: vcmpt.s32 gt, q0, zr +; CHECK-NEXT: vmovt q3, q2 +; CHECK-NEXT: vmove q3, q2 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: bx lr +entry: + %0 = icmp sge <4 x i32> %x, %z + %1 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %x, <4 x i32> %y, i32 0, <4 x i1> %0, <4 x i32> %z) + %2 = icmp sgt <4 x i32> %x, zeroinitializer + %3 = and <4 x i1> %0, %2 + %4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %3, <4 x i32> %1) + %5 = xor <4 x i1> %3, + %6 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %5, <4 x i32> %4) + ret <4 x i32> %6 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll index ce82ba0909af..df211f1efebc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -19,11 +19,9 @@ define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpt.s32 ge, q1, r2 +; CHECK-NEXT: vpte.s32 ge, q1, r2 ; CHECK-NEXT: vcmpt.s32 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0], #16 +; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -77,11 +75,9 @@ define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vpt.s16 ge, q1, r2 +; CHECK-NEXT: vpte.s16 ge, q1, r2 ; CHECK-NEXT: vcmpt.s16 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0], #16 +; CHECK-NEXT: vstrhe.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -135,11 +131,9 @@ define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T) ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vpt.s8 ge, q1, r2 +; CHECK-NEXT: vpte.s8 ge, q1, r2 ; CHECK-NEXT: vcmpt.s8 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q0, [r0], #16 +; CHECK-NEXT: vstrbe.8 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -195,11 +189,9 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; 
CHECK-NEXT: vpt.f32 ge, q1, r2 +; CHECK-NEXT: vpte.f32 ge, q1, r2 ; CHECK-NEXT: vcmpt.f32 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0], #16 +; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -255,11 +247,9 @@ define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.co ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vpte.f16 ge, q1, r2 ; CHECK-NEXT: vcmpt.f16 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0], #16 +; CHECK-NEXT: vstrhe.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -318,11 +308,9 @@ define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) { ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpt.s32 ge, q1, r2 +; CHECK-NEXT: vpte.s32 ge, q1, r2 ; CHECK-NEXT: vcmpt.s32 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0], #16 +; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -376,11 +364,9 @@ define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 sign ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vpt.s16 ge, q1, r2 +; CHECK-NEXT: vpte.s16 ge, q1, r2 ; CHECK-NEXT: vcmpt.s16 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0], #16 +; CHECK-NEXT: vstrhe.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -434,11 +420,9 @@ define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vpt.s8 ge, q1, r2 +; CHECK-NEXT: vpte.s8 ge, q1, r2 ; CHECK-NEXT: vcmpt.s8 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q0, [r0], #16 +; CHECK-NEXT: vstrbe.8 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -494,11 +478,9 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpt.f32 ge, q1, r2 +; CHECK-NEXT: vpte.f32 ge, q1, r2 ; CHECK-NEXT: vcmpt.f32 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0], #16 +; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -554,11 +536,9 @@ define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float % ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vpte.f16 ge, q1, r2 ; CHECK-NEXT: vcmpt.f16 le, q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0], #16 +; CHECK-NEXT: vstrhe.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup 
; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll new file mode 100644 index 000000000000..c32abb24dd87 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -0,0 +1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK + +define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) { +; CHECK-LABEL: test32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: blt .LBB0_2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vrev64.32 q1, q2 +; CHECK-NEXT: vrev64.32 q4, q3 +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: vmov lr, s16 +; CHECK-NEXT: smull r12, r5, lr, r12 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: smull r12, r5, r5, r12 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov r12, s8 +; CHECK-NEXT: vmov.32 q0[3], r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: smull r12, r5, r5, r12 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vmov r12, s10 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: smull r12, r5, r5, r12 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q1[2], r12 +; CHECK-NEXT: vmov.32 q1[3], r5 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r5, pc} +entry: + %0 = and i32 %n, 3 + %cmp = icmp eq i32 %0, 0 + %cmp113 = icmp sgt i32 %n, 0 + br i1 %cmp113, label %vector.body, label %for.cond.cleanup + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ] + %1 = getelementptr inbounds i32, i32* %x, i32 %index + %2 = bitcast i32* %1 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %2, align 4 + %3 = shufflevector <4 x i32> %wide.load, <4 x i32> %wide.load, <2 x i32> + %4 = shufflevector <4 x i32> %wide.load, <4 x i32> %wide.load, <2 x i32> + %5 = sext <2 x i32> %3 to <2 x i64> + %6 = sext <2 x i32> %4 to <2 x i64> + %7 = getelementptr inbounds i32, i32* %y, i32 %index + %8 = bitcast i32* %7 to <4 x i32>* + %wide.load15 = load <4 x i32>, <4 x i32>* %8, align 4 + %9 = shufflevector <4 x i32> %wide.load15, <4 x i32> %wide.load15, <2 x i32> + %10 = shufflevector <4 x i32> %wide.load15, <4 x i32> %wide.load15, <2 x i32> + %11 = sext <2 x i32> %9 to <2 x i64> + %12 = sext <2 x i32> %10 to <2 x i64> + %13 = mul <2 x i64> %11, %5 + %14 = mul <2 x i64> %12, %6 + %15 = lshr <2 x i64> %13, + %16 = lshr <2 x i64> %14, + %17 = shufflevector <2 x i64> %15, <2 x i64> %16, <4 x i32> + %18 = trunc <4 x i64> %17 to <4 x i32> 
+ %19 = getelementptr inbounds i32, i32* %z, i32 %index + %20 = bitcast i32* %19 to <4 x i32>* + store <4 x i32> %18, <4 x i32>* %20, align 4 + %index.next = add i32 %index, 4 + %21 = icmp eq i32 %index.next, %n + br i1 %21, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i16* nocapture %z, i32 %n) { +; CHECK-LABEL: test16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q2, [r1], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmovlt.s16 q1, q0 +; CHECK-NEXT: vmovlt.s16 q3, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmul.i32 q1, q3, q1 +; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: vshr.u32 q1, q1, #15 +; CHECK-NEXT: vshr.u32 q0, q0, #15 +; CHECK-NEXT: vmovnt.i32 q0, q1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + %cmp113 = icmp sgt i32 %n, 0 + br i1 %cmp113, label %vector.body, label %for.cond.cleanup + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ] + %1 = getelementptr inbounds i16, i16* %x, i32 %index + %2 = bitcast i16* %1 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %2, align 2 + %3 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> + %4 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> + %5 = sext <4 x i16> %3 to <4 x i32> + %6 = sext <4 x i16> %4 to <4 x i32> + %7 = getelementptr inbounds i16, i16* %y, i32 %index + %8 = bitcast i16* %7 to <8 x i16>* + %wide.load15 = load <8 x i16>, <8 x i16>* %8, align 2 + %9 = shufflevector <8 x i16> %wide.load15, <8 x i16> %wide.load15, <4 x i32> + %10 = shufflevector <8 x i16> %wide.load15, <8 x i16> %wide.load15, <4 x i32> + %11 = sext <4 x i16> %9 to <4 x i32> + %12 = sext <4 x i16> %10 to <4 x i32> + %13 = mul <4 x i32> %11, %5 + %14 = mul <4 x i32> %12, %6 + %15 = lshr <4 x i32> %13, + %16 = lshr <4 x i32> %14, + %17 = shufflevector <4 x i32> %15, <4 x i32> %16, <8 x i32> + %18 = trunc <8 x i32> %17 to <8 x i16> + %19 = getelementptr inbounds i16, i16* %z, i32 %index + %20 = bitcast i16* %19 to <8 x i16>* + store <8 x i16> %18, <8 x i16>* %20, align 2 + %index.next = add i32 %index, 8 + %21 = icmp eq i32 %index.next, %n + br i1 %21, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i8* nocapture %z, i32 %n) { +; CHECK-LABEL: test8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q2, [r1], #16 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmovlt.u8 q1, q0 +; CHECK-NEXT: vmovlt.u8 q3, q2 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vmovlb.u8 q2, q2 +; CHECK-NEXT: vmul.i16 q1, q3, q1 +; CHECK-NEXT: vmul.i16 q0, q2, q0 +; CHECK-NEXT: vshr.u16 q1, q1, #7 +; CHECK-NEXT: vshr.u16 q0, q0, #7 +; CHECK-NEXT: vmovnt.i16 q0, q1 +; 
CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 15 + %cmp = icmp eq i32 %0, 0 + %cmp117 = icmp sgt i32 %n, 0 + br i1 %cmp117, label %vector.body, label %for.cond.cleanup + +vector.body: ; preds = %vector.body, %entry + %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ] + %1 = getelementptr inbounds i8, i8* %x, i32 %index + %2 = bitcast i8* %1 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %2, align 1 + %3 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> + %4 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> + %5 = zext <8 x i8> %3 to <8 x i16> + %6 = zext <8 x i8> %4 to <8 x i16> + %7 = getelementptr inbounds i8, i8* %y, i32 %index + %8 = bitcast i8* %7 to <16 x i8>* + %wide.load19 = load <16 x i8>, <16 x i8>* %8, align 1 + %9 = shufflevector <16 x i8> %wide.load19, <16 x i8> %wide.load19, <8 x i32> + %10 = shufflevector <16 x i8> %wide.load19, <16 x i8> %wide.load19, <8 x i32> + %11 = zext <8 x i8> %9 to <8 x i16> + %12 = zext <8 x i8> %10 to <8 x i16> + %13 = mul <8 x i16> %11, %5 + %14 = mul <8 x i16> %12, %6 + %15 = lshr <8 x i16> %13, + %16 = lshr <8 x i16> %14, + %17 = shufflevector <8 x i16> %15, <8 x i16> %16, <16 x i32> + %18 = trunc <16 x i16> %17 to <16 x i8> + %19 = getelementptr inbounds i8, i8* %z, i32 %index + %20 = bitcast i8* %19 to <16 x i8>* + store <16 x i8> %18, <16 x i8>* %20, align 1 + %index.next = add i32 %index, 16 + %21 = icmp eq i32 %index.next, %n + br i1 %21, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir index 347c4870ab6a..7401d771d1b6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir @@ -68,14 +68,10 @@ body: | ; CHECK: liveins: $q0, $q1, $q2, $r0 ; CHECK: $vpr = VMSR_P0 killed $r0, 14 /* CC::al */, $noreg ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 - ; CHECK: BUNDLE implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $vpr, implicit killed $q1, implicit $q2, implicit killed $q3 { - ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: BUNDLE implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $vpr, implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit killed $vpr, implicit killed $q1, implicit $q2, implicit killed $q3 { + ; CHECK: MVE_VPST 12, implicit $vpr ; CHECK: renamable $q3 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q1, renamable $q2, 1, renamable $vpr, killed renamable $q3 - ; CHECK: } - ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg - ; CHECK: BUNDLE implicit-def $q1, implicit-def $d2, implicit-def $s4, implicit-def $s5, implicit-def $d3, implicit-def $s6, implicit-def $s7, implicit $vpr, implicit killed $q3, implicit undef $q1 { - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 killed renamable $q3, renamable $q3, 1, renamable $vpr, undef renamable $q1 + ; CHECK: renamable $q1 = nnan ninf nsz MVE_VMINNMf32 internal killed renamable $q3, internal renamable $q3, 2, internal renamable $vpr, undef 
renamable $q1 ; CHECK: } ; CHECK: $q3 = MVE_VORR $q0, $q0, 0, $noreg, undef $q3 ; CHECK: BUNDLE implicit-def dead $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $s1, implicit-def $d1, implicit-def $s2, implicit-def $s3, implicit killed $vpr, implicit killed $q1, implicit killed $q2, implicit killed $q3, implicit killed $q0 { diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir new file mode 100644 index 000000000000..765d3a4de831 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir @@ -0,0 +1,231 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define hidden arm_aapcs_vfpcc <4 x float> @vpt_block_else(<4 x float> %inactive1, <4 x float> %inactive2, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { + entry: + %conv.i = zext i16 %p to i32 + %0 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %a, <4 x float> %b, i32 %conv.i) #2 + %1 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> undef, <4 x float> %0, <4 x float> %0, i32 %conv.i) #2 + %2 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive1, <4 x float> %1, <4 x float> %b, i32 %conv.i) #2 + %3 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive2, <4 x float> %2, <4 x float> %b, i32 %conv.i) #2 + ret <4 x float> %3 + } + + declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1 + + attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { nounwind readnone } + attributes #2 = { nounwind } + +... 
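The MIR checks below exercise the new "else" slots directly: the immediate operand after MVE_VPST / MVE_VPTv4s32 is the block mask, which can now encode else slots, and an instruction in an else slot carries predicate code 2 where a then slot carries 1 (visible throughout these bundles). At the IR level, the pattern that folds into an else slot is a VPNOT, i.e. an xor of the predicate with an all-true vector, feeding further predicated work. A hedged sketch (hypothetical function, reconstructed to mirror vpnot_v4i1 from mve-pred-not.ll earlier in this patch and built with the usual llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve invocation), which the updated checks there show compiling to a single vpte.s32 block instead of vpt + vpnot + vpst:

define arm_aapcs_vfpcc <4 x i32> @vpnot_to_else(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
entry:
  %c1 = icmp slt <4 x i32> %a, zeroinitializer   ; opens the block: vpte.s32 lt, q0, zr
  %c2 = icmp sgt <4 x i32> %b, zeroinitializer   ; then slot: vcmpt.s32 gt, q1, zr
  %t = and <4 x i1> %c1, %c2
  ; This xor is the VPNOT that the pass now folds into the block mask.
  %n = xor <4 x i1> %t, <i1 true, i1 true, i1 true, i1 true>
  %c3 = icmp eq <4 x i32> %c, zeroinitializer    ; else slot: vcmpe.i32 eq, q2, zr
  %o = and <4 x i1> %n, %c3
  %s = select <4 x i1> %o, <4 x i32> %a, <4 x i32> %b ; vpsel q0, q0, q1
  ret <4 x i32> %s
}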
+--- +name: vpt_block_else +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$q0', virtual-reg: '' } + - { reg: '$q1', virtual-reg: '' } + - { reg: '$q2', virtual-reg: '' } + - { reg: '$q3', virtual-reg: '' } + - { reg: '$r0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +constants: [] +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + ; CHECK-LABEL: name: vpt_block_else + ; CHECK: liveins: $q0, $q1, $q2 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit killed $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr { + ; CHECK: MVE_VPTv4s32 5, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q0, $zr, 12, 1, internal killed renamable $vpr + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit killed $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr { + ; CHECK: MVE_VPTv4s32 7, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q0, $zr, 12, 1, internal killed renamable $vpr + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr { + ; CHECK: MVE_VPTv4s32 13, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr + ; 
CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr { + ; CHECK: MVE_VPTv4s32 9, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3 { + ; CHECK: MVE_VPTv4s32 15, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3 { + ; CHECK: MVE_VPTv4s32 14, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr { + ; CHECK: MVE_VPTv4s32 10, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr + ; CHECK: renamable $q3 = MVE_VORR 
renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3 { + ; CHECK: MVE_VPTv4s32 6, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, internal killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + ; CHECK: BUNDLE implicit-def $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit killed $q3 { + ; CHECK: MVE_VPTv4s32 11, renamable $q0, renamable $q2, 10, implicit-def $vpr + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, internal killed renamable $q3 + ; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal killed renamable $q3 + ; CHECK: } + ; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0 + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VCMPs32r killed renamable $q0, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, renamable $q3 + $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VCMPs32r killed renamable $q0, $zr, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, renamable $q3 + $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, 
renamable $vpr, renamable $q3 + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q3 + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, killed renamable $vpr, killed renamable $q3 + $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0 + + $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3 + renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg + renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3 + renamable 
$vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+  renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3
+  renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
+  renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3
+  $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
+
+  tBX_RET 14, $noreg, implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir
index b3e953964b19..8bc7a0b53598 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-nots.mir
@@ -61,14 +61,10 @@ body: |
 ; CHECK-LABEL: name: vpnot
 ; CHECK: liveins: $q0, $q1, $q2
- ; CHECK: BUNDLE implicit-def $vpr, implicit $q0, implicit $zr, implicit $q1 {
- ; CHECK: MVE_VPTv4s32r 8, renamable $q0, $zr, 11, implicit-def $vpr
+ ; CHECK: BUNDLE implicit-def $vpr, implicit $q0, implicit $zr, implicit $q1, implicit killed $q2 {
+ ; CHECK: MVE_VPTv4s32r 12, renamable $q0, $zr, 11, implicit-def $vpr
 ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, internal killed renamable $vpr
- ; CHECK: }
- ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
- ; CHECK: BUNDLE implicit-def $vpr, implicit killed $vpr, implicit killed $q2, implicit $zr {
- ; CHECK: MVE_VPST 8, implicit $vpr
- ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr
+ ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 2, internal killed renamable $vpr
 ; CHECK: }
 ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0
@@ -244,14 +240,10 @@ body: |
 ; CHECK: liveins: $q0, $q1, $q2
 ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q0, $zr, 11, 0, $noreg
 ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
- ; CHECK: BUNDLE implicit-def $vpr, implicit killed $vpr, implicit $q1, implicit $zr {
- ; CHECK: MVE_VPST 8, implicit $vpr
+ ; CHECK: BUNDLE implicit-def $vpr, implicit killed $vpr, implicit $q1, implicit $zr, implicit killed $q2 {
+ ; CHECK: MVE_VPST 12, implicit $vpr
 ; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 1, killed renamable $vpr
- ; CHECK: }
- ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
- ; CHECK: BUNDLE implicit-def $vpr, implicit killed $vpr, implicit killed $q2, implicit $zr {
- ; CHECK: MVE_VPST 8, implicit $vpr
- ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 1, killed renamable $vpr
+ ; CHECK: renamable $vpr = MVE_VCMPi32r killed renamable $q2, $zr, 0, 2, internal killed renamable $vpr
 ; CHECK: }
 ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
 ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
index efdb2c6d684c..a4d8537343ca 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
@@ -729,18 +729,98 @@ terminate: ; preds = %entry
   unreachable
 }
+%class.MyClass = type { i32 }
+
+; This used to crash in debug mode (i.e., when NDEBUG is not defined) because
+; the logic for computing the innermost region was incorrect when a loop
+; region contained an exception region. This should pass CFGSort without
+; crashing.
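+;
+; Region nesting that triggered the bug (a sketch reconstructed from the
+; blocks below; the loop region fully contains the exception region):
+;
+;   loop: for.cond -> for.body -> for.inc -> for.cond
+;     exception region: catch.dispatch -> catch.start -> catch / rethrow
+;                       -> ehcleanup -> terminate7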
+define void @test12() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { +entry: + %e = alloca %class.MyClass, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 9 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + invoke void @quux(i32 %i.0) + to label %for.inc unwind label %catch.dispatch + +catch.dispatch: ; preds = %for.body + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* bitcast ({ i8*, i8* }* @_ZTI7MyClass to i8*)] + %2 = call i8* @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i32 @llvm.eh.typeid.for(i8* bitcast ({ i8*, i8* }* @_ZTI7MyClass to i8*)) #3 + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %rethrow + +catch: ; preds = %catch.start + %5 = call i8* @__cxa_get_exception_ptr(i8* %2) #3 [ "funclet"(token %1) ] + %6 = bitcast i8* %5 to %class.MyClass* + %call = call %class.MyClass* @_ZN7MyClassC2ERKS_(%class.MyClass* %e, %class.MyClass* dereferenceable(4) %6) [ "funclet"(token %1) ] + %7 = call i8* @__cxa_begin_catch(i8* %2) #3 [ "funclet"(token %1) ] + %x = getelementptr inbounds %class.MyClass, %class.MyClass* %e, i32 0, i32 0 + %8 = load i32, i32* %x, align 4 + invoke void @quux(i32 %8) [ "funclet"(token %1) ] + to label %invoke.cont2 unwind label %ehcleanup + +invoke.cont2: ; preds = %catch + %call3 = call %class.MyClass* @_ZN7MyClassD2Ev(%class.MyClass* %e) #3 [ "funclet"(token %1) ] + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %for.inc + +rethrow: ; preds = %catch.start + call void @llvm.wasm.rethrow.in.catch() #6 [ "funclet"(token %1) ] + unreachable + +for.inc: ; preds = %invoke.cont2, %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +ehcleanup: ; preds = %catch + %9 = cleanuppad within %1 [] + %call4 = call %class.MyClass* @_ZN7MyClassD2Ev(%class.MyClass* %e) #3 [ "funclet"(token %9) ] + invoke void @__cxa_end_catch() [ "funclet"(token %9) ] + to label %invoke.cont6 unwind label %terminate7 + +invoke.cont6: ; preds = %ehcleanup + cleanupret from %9 unwind to caller + +for.end: ; preds = %for.cond + ret void + +terminate7: ; preds = %ehcleanup + %10 = cleanuppad within %9 [] + %11 = call i8* @llvm.wasm.get.exception(token %10) + call void @__clang_call_terminate(i8* %11) #7 [ "funclet"(token %10) ] + unreachable +} + ; Check if the unwind destination mismatch stats are correct -; NOSORT-STAT: 11 wasm-cfg-stackify - Number of EH pad unwind mismatches found +; NOSORT-STAT: 14 wasm-cfg-stackify - Number of EH pad unwind mismatches found declare void @foo() declare void @bar() declare i32 @baz() +declare void @quux(i32) declare void @fun(i32) ; Function Attrs: nounwind declare void @nothrow(i32) #0 declare i32 @nothrow_i32() #0 + ; Function Attrs: nounwind declare %class.Object* @_ZN6ObjectD2Ev(%class.Object* returned) #0 +@_ZTI7MyClass = external constant { i8*, i8* }, align 4 +; Function Attrs: nounwind +declare %class.MyClass* @_ZN7MyClassD2Ev(%class.MyClass* returned) #0 +; Function Attrs: nounwind +declare %class.MyClass* @_ZN7MyClassC2ERKS_(%class.MyClass* returned, %class.MyClass* dereferenceable(4)) #0 + declare i32 @__gxx_wasm_personality_v0(...) 
declare i8* @llvm.wasm.get.exception(token) declare i32 @llvm.wasm.get.ehselector(token) @@ -748,6 +828,7 @@ declare void @llvm.wasm.rethrow.in.catch() declare i32 @llvm.eh.typeid.for(i8*) declare i8* @__cxa_begin_catch(i8*) declare void @__cxa_end_catch() +declare i8* @__cxa_get_exception_ptr(i8*) declare void @__clang_call_terminate(i8*) declare void @_ZSt9terminatev() ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll index cad6c4ac855b..ad42cc5f8615 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll @@ -308,6 +308,10 @@ attributes #0 = { returns_twice } attributes #1 = { noreturn } attributes #2 = { nounwind } attributes #3 = { allocsize(0) } +; CHECK: attributes #{{[0-9]+}} = { "wasm-import-module"="env" "wasm-import-name"="__invoke_void" } +; CHECK: attributes #{{[0-9]+}} = { "wasm-import-module"="env" "wasm-import-name"="__cxa_find_matching_catch_3" } +; CHECK: attributes #{{[0-9]+}} = { "wasm-import-module"="env" "wasm-import-name"="__invoke_i8*_i32_%struct.__jmp_buf_tag*" } +; CHECK: attributes #{{[0-9]+}} = { "wasm-import-module"="env" "wasm-import-name"="__invoke_void_%struct.__jmp_buf_tag*_i32" } ; CHECK: attributes #[[ALLOCSIZE_ATTR]] = { allocsize(1) } !llvm.dbg.cu = !{!2} diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll index afb4acd736d3..e12a4152ac7e 100644 --- a/llvm/test/CodeGen/X86/combine-ptest.ll +++ b/llvm/test/CodeGen/X86/combine-ptest.ll @@ -9,10 +9,8 @@ define i32 @ptestz_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vptest %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t1 = xor <2 x i64> %c, %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %d) @@ -25,11 +23,8 @@ define i32 @ptestz_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vptest %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t1 = xor <4 x i64> %c, @@ -47,10 +42,8 @@ define i32 @ptestz_128_invert1(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_128_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vptest %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vptest %xmm0, %xmm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t1 = xor <2 x i64> %d, %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %t1) @@ -63,11 +56,8 @@ define i32 @ptestz_256_invert1(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vptest %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vptest %ymm0, %ymm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t1 = xor <4 x i64> 
%d, @@ -85,10 +75,8 @@ define i32 @ptestc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vptest %xmm1, %xmm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t1 = xor <2 x i64> %c, %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d) @@ -101,11 +89,8 @@ define i32 @ptestc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vptest %ymm1, %ymm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t1 = xor <4 x i64> %c, @@ -123,10 +108,8 @@ define i32 @ptestnzc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestnzc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vptest %xmm1, %xmm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t1 = xor <2 x i64> %c, %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d) @@ -139,9 +122,6 @@ define i32 @ptestnzc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestnzc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vptest %ymm1, %ymm0 ; CHECK-NEXT: cmovbel %esi, %eax ; CHECK-NEXT: vzeroupper @@ -153,6 +133,21 @@ define i32 @ptestnzc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ret i32 %t4 } +define i32 @ptestnzc_256_invert0_commute(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { +; CHECK-LABEL: ptestnzc_256_invert0_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: cmoval %esi, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %t1 = xor <4 x i64> %c, + %t2 = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %t1, <4 x i64> %d) + %t3 = icmp eq i32 %t2, 0 + %t4 = select i1 %t3, i32 %a, i32 %b + ret i32 %t4 +} + ; ; testz(-1,X) -> testz(X,X) ; @@ -161,8 +156,7 @@ define i32 @ptestz_128_allones0(<2 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_128_allones0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vptest %xmm0, %xmm1 +; CHECK-NEXT: vptest %xmm0, %xmm0 ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> , <2 x i64> %c) @@ -175,9 +169,7 @@ define i32 @ptestz_256_allones0(<4 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_allones0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vptest %ymm0, %ymm1 +; CHECK-NEXT: vptest %ymm0, %ymm0 ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -195,8 +187,7 @@ define i32 @ptestz_128_allones1(<2 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_128_allones1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vptest %xmm1, %xmm0 +; CHECK-NEXT: vptest %xmm0, 
%xmm0 ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> ) @@ -209,9 +200,7 @@ define i32 @ptestz_256_allones1(<4 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_allones1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: vptest %ymm0, %ymm0 ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -226,10 +215,8 @@ define zeroext i1 @PR38522(<16 x i8>* %x, <16 x i8>* %y) { ; CHECK: # %bb.0: # %start ; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vptest %xmm1, %xmm0 -; CHECK-NEXT: setb %al +; CHECK-NEXT: vptest %xmm0, %xmm0 +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq start: %0 = load <16 x i8>, <16 x i8>* %x, align 16 diff --git a/llvm/test/CodeGen/X86/combine-testpd.ll b/llvm/test/CodeGen/X86/combine-testpd.ll index b43ac2a2ea0d..9ae3d80e59cd 100644 --- a/llvm/test/CodeGen/X86/combine-testpd.ll +++ b/llvm/test/CodeGen/X86/combine-testpd.ll @@ -9,10 +9,8 @@ define i32 @testpdz_128_invert0(<2 x double> %c, <2 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdz_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestpd %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <2 x double> %c to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -27,11 +25,8 @@ define i32 @testpdz_256_invert0(<4 x double> %c, <4 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdz_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestpd %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = bitcast <4 x double> %c to <4 x i64> @@ -51,10 +46,8 @@ define i32 @testpdz_128_invert1(<2 x double> %c, <2 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdz_128_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vtestpd %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vtestpd %xmm0, %xmm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <2 x double> %d to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -69,11 +62,8 @@ define i32 @testpdz_256_invert1(<4 x double> %c, <4 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdz_256_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vtestpd %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vtestpd %ymm0, %ymm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = bitcast <4 x double> %d to <4 x i64> @@ -93,10 +83,8 @@ define i32 @testpdc_128_invert0(<2 x double> %c, <2 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestpd %xmm1, %xmm0 -; CHECK-NEXT: cmovael %esi, %eax +; 
CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <2 x double> %c to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -111,11 +99,8 @@ define i32 @testpdc_256_invert0(<4 x double> %c, <4 x double> %d, i32 %a, i32 %b ; CHECK-LABEL: testpdc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestpd %ymm1, %ymm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = bitcast <4 x double> %c to <4 x i64> @@ -135,8 +120,6 @@ define i32 @testpdnzc_128_invert0(<2 x double> %c, <2 x double> %d, i32 %a, i32 ; CHECK-LABEL: testpdnzc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestpd %xmm1, %xmm0 ; CHECK-NEXT: cmovbel %esi, %eax ; CHECK-NEXT: retq @@ -153,9 +136,6 @@ define i32 @testpdnzc_256_invert0(<4 x double> %c, <4 x double> %d, i32 %a, i32 ; CHECK-LABEL: testpdnzc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestpd %ymm1, %ymm0 ; CHECK-NEXT: cmovbel %esi, %eax ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll index 6b5e65322c64..f3605441348d 100644 --- a/llvm/test/CodeGen/X86/combine-testps.ll +++ b/llvm/test/CodeGen/X86/combine-testps.ll @@ -9,10 +9,8 @@ define i32 @testpsz_128_invert0(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsz_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestps %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <4 x float> %c to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -27,11 +25,8 @@ define i32 @testpsz_256_invert0(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsz_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestps %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = bitcast <8 x float> %c to <4 x i64> @@ -51,10 +46,8 @@ define i32 @testpsz_128_invert1(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsz_128_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vtestps %xmm1, %xmm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vtestps %xmm0, %xmm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <4 x float> %d to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -69,11 +62,8 @@ define i32 @testpsz_256_invert1(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsz_256_invert1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vtestps %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: vtestps %ymm0, %ymm1 +; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; 
CHECK-NEXT: retq %t0 = bitcast <8 x float> %d to <4 x i64> @@ -93,10 +83,8 @@ define i32 @testpsc_128_invert0(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestps %xmm1, %xmm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t0 = bitcast <4 x float> %c to <2 x i64> %t1 = xor <2 x i64> %t0, @@ -111,11 +99,8 @@ define i32 @testpsc_256_invert0(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) ; CHECK-LABEL: testpsc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestps %ymm1, %ymm0 -; CHECK-NEXT: cmovael %esi, %eax +; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = bitcast <8 x float> %c to <4 x i64> @@ -135,8 +120,6 @@ define i32 @testpsnzc_128_invert0(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b ; CHECK-LABEL: testpsnzc_128_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vtestps %xmm1, %xmm0 ; CHECK-NEXT: cmovbel %esi, %eax ; CHECK-NEXT: retq @@ -153,9 +136,6 @@ define i32 @testpsnzc_256_invert0(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b ; CHECK-LABEL: testpsnzc_256_invert0: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vtestps %ymm1, %ymm0 ; CHECK-NEXT: cmovbel %esi, %eax ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll index d2d646248616..fbaf2d0f0914 100644 --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,XOP +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,XOP,XOPAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,XOP,XOPAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,AVX512 define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) { @@ -77,10 +78,20 @@ define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { } define <4 x i32> @rot_v4i32_zero_non_splat(<4 x i32> %x) { -; CHECK-LABEL: rot_v4i32_zero_non_splat: -; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 -; CHECK-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_zero_non_splat: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_zero_non_splat: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: rot_v4i32_zero_non_splat: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> ) %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %2 @@ -97,12 +108,19 @@ define <4 x i32> @rot_v4i32_allsignbits(<4 x i32> %x, <4 x i32> %y) { } define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) { -; 
XOP-LABEL: rot_v4i32_mask_ashr0: -; XOP: # %bb.0: -; XOP-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vprotd $1, %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_mask_ashr0: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_mask_ashr0: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr0: ; AVX512: # %bb.0: @@ -118,13 +136,21 @@ define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) { } define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) { -; XOP-LABEL: rot_v4i32_mask_ashr1: -; XOP: # %bb.0: -; XOP-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOP-NEXT: vprotd $1, %xmm0, %xmm0 -; XOP-NEXT: vpbroadcastd %xmm0, %xmm0 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX2-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash-3.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash-3.ll new file mode 100644 index 000000000000..7ddc0f1b2e4a --- /dev/null +++ b/llvm/test/CodeGen/X86/shuffle-combine-crash-3.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Verify that we don't crash when compiling this. We used to hit an +; assert like this +; +; llc: ../include/llvm/CodeGen/ValueTypes.h:251: llvm::MVT llvm::EVT::getSimpleVT() const: Assertion `isSimple() && "Expected a SimpleValueType!"' failed. +; +; due to getFauxShuffleMask not checking that the VT was simple before a call +; to getSimpleValueType(). 
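+;
+; A hedged sketch of the kind of guard that avoids the assert (assuming the
+; shape of getFauxShuffleMask; names taken from the assert message above):
+;   // bail out before calling getSimpleVT() on a non-simple EVT
+;   if (!VT.isSimple())
+;     return false;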
+ +define i1 @dont_hit_assert(i24 signext %d) { +; CHECK-LABEL: dont_hit_assert: +; CHECK: # %bb.0: # %for.cond +; CHECK-NEXT: movb $-1, %al +; CHECK-NEXT: negb %al +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq +for.cond: + %t0 = insertelement <8 x i24> zeroinitializer, i24 1, i32 0 + %t5 = icmp slt <8 x i24> %t0, zeroinitializer + %t7 = icmp slt i24 0, %d + %rdx.shuf = shufflevector <8 x i1> %t5, <8 x i1> undef, <8 x i32> + %bin.rdx = and <8 x i1> %t5, %rdx.shuf + %rdx.shuf22 = shufflevector <8 x i1> %bin.rdx, <8 x i1> undef, <8 x i32> + %bin.rdx23 = and <8 x i1> %bin.rdx, %rdx.shuf22 + %rdx.shuf24 = shufflevector <8 x i1> %bin.rdx23, <8 x i1> undef, <8 x i32> + %bin.rdx25 = and <8 x i1> %bin.rdx23, %rdx.shuf24 + %t8 = extractelement <8 x i1> %bin.rdx25, i32 0 + ret i1 %t8 +} diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index 7be19c07da80..4483de105385 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -9,17 +9,30 @@ define float @sqrt_ieee(float %f) #0 { ; CHECK: liveins: $xmm0 ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK: $xmm0 = COPY %1 + ; CHECK: RET 0, $xmm0 + %call = tail call float @llvm.sqrt.f32(float %f) + ret float %call +} + +define float @sqrt_ieee_ninf(float %f) #0 { + ; CHECK-LABEL: name: sqrt_ieee_ninf + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: liveins: $xmm0 + ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK: %3:fr32 = ninf nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load 4 from constant-pool) - ; CHECK: %5:fr32 = nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK: %5:fr32 = ninf nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load 4 from constant-pool) - ; CHECK: %7:fr32 = nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr + ; CHECK: %7:fr32 = ninf nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK: %8:fr32 = ninf nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr + ; CHECK: %9:fr32 = ninf nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr + ; CHECK: %10:fr32 = ninf nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK: %11:fr32 = ninf nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK: %12:fr32 = ninf nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] ; CHECK: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load 4 from 
constant-pool) @@ -31,7 +44,7 @@ define float @sqrt_ieee(float %f) #0 { ; CHECK: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] ; CHECK: $xmm0 = COPY [[COPY5]] ; CHECK: RET 0, $xmm0 - %call = tail call float @llvm.sqrt.f32(float %f) + %call = tail call ninf float @llvm.sqrt.f32(float %f) ret float %call } @@ -41,17 +54,30 @@ define float @sqrt_daz(float %f) #1 { ; CHECK: liveins: $xmm0 ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK: $xmm0 = COPY %1 + ; CHECK: RET 0, $xmm0 + %call = tail call float @llvm.sqrt.f32(float %f) + ret float %call +} + +define float @sqrt_daz_ninf(float %f) #1 { + ; CHECK-LABEL: name: sqrt_daz_ninf + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: liveins: $xmm0 + ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK: %3:fr32 = ninf nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load 4 from constant-pool) - ; CHECK: %5:fr32 = nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK: %5:fr32 = ninf nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load 4 from constant-pool) - ; CHECK: %7:fr32 = nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr + ; CHECK: %7:fr32 = ninf nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK: %8:fr32 = ninf nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr + ; CHECK: %9:fr32 = ninf nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr + ; CHECK: %10:fr32 = ninf nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK: %11:fr32 = ninf nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK: %12:fr32 = ninf nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 ; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS ; CHECK: %15:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr @@ -60,7 +86,7 @@ define float @sqrt_daz(float %f) #1 { ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] ; CHECK: $xmm0 = COPY [[COPY3]] ; CHECK: RET 0, $xmm0 - %call = tail call float @llvm.sqrt.f32(float %f) + %call = tail call ninf float @llvm.sqrt.f32(float %f) ret float %call } diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index f10199ce958f..b2593bc43578 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -59,6 +59,20 @@ define float @finite_f32_no_estimate(float %f) #0 { define float @finite_f32_estimate_ieee(float %f) #1 { ; SSE-LABEL: finite_f32_estimate_ieee: ; SSE: # %bb.0: +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; 
AVX-LABEL: finite_f32_estimate_ieee: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %call = tail call float @__sqrtf_finite(float %f) #2 + ret float %call +} + +define float @finite_f32_estimate_ieee_ninf(float %f) #1 { +; SSE-LABEL: finite_f32_estimate_ieee_ninf: +; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 @@ -72,7 +86,7 @@ define float @finite_f32_estimate_ieee(float %f) #1 { ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: finite_f32_estimate_ieee: +; AVX1-LABEL: finite_f32_estimate_ieee_ninf: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -85,7 +99,7 @@ define float @finite_f32_estimate_ieee(float %f) #1 { ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: finite_f32_estimate_ieee: +; AVX512-LABEL: finite_f32_estimate_ieee_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -99,13 +113,27 @@ define float @finite_f32_estimate_ieee(float %f) #1 { ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %call = tail call float @__sqrtf_finite(float %f) #2 + %call = tail call ninf float @__sqrtf_finite(float %f) #2 ret float %call } define float @finite_f32_estimate_daz(float %f) #4 { ; SSE-LABEL: finite_f32_estimate_daz: ; SSE: # %bb.0: +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: finite_f32_estimate_daz: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %call = tail call float @__sqrtf_finite(float %f) #2 + ret float %call +} + +define float @finite_f32_estimate_daz_ninf(float %f) #4 { +; SSE-LABEL: finite_f32_estimate_daz_ninf: +; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 @@ -119,7 +147,7 @@ define float @finite_f32_estimate_daz(float %f) #4 { ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: finite_f32_estimate_daz: +; AVX1-LABEL: finite_f32_estimate_daz_ninf: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -132,7 +160,7 @@ define float @finite_f32_estimate_daz(float %f) #4 { ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: finite_f32_estimate_daz: +; AVX512-LABEL: finite_f32_estimate_daz_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -144,7 +172,7 @@ define float @finite_f32_estimate_daz(float %f) #4 { ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %call = tail call float @__sqrtf_finite(float %f) #2 + %call = tail call ninf float @__sqrtf_finite(float %f) #2 ret float %call } @@ -175,6 +203,20 @@ define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 { define float @sqrtf_check_denorms(float %x) #3 { ; SSE-LABEL: sqrtf_check_denorms: ; SSE: # %bb.0: +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtf_check_denorms: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %call = tail call float @__sqrtf_finite(float %x) #2 + ret float %call +} + +define float @sqrtf_check_denorms_ninf(float %x) #3 { +; SSE-LABEL: sqrtf_check_denorms_ninf: +; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 @@ -188,7 +230,7 @@ define float 
@sqrtf_check_denorms(float %x) #3 { ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: sqrtf_check_denorms: +; AVX1-LABEL: sqrtf_check_denorms_ninf: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -201,7 +243,7 @@ define float @sqrtf_check_denorms(float %x) #3 { ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: sqrtf_check_denorms: +; AVX512-LABEL: sqrtf_check_denorms_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2 @@ -215,13 +257,27 @@ define float @sqrtf_check_denorms(float %x) #3 { ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %call = tail call float @__sqrtf_finite(float %x) #2 + %call = tail call ninf float @__sqrtf_finite(float %x) #2 ret float %call } define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; SSE-LABEL: sqrt_v4f32_check_denorms: ; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrt_v4f32_check_denorms: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: retq + %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2 + ret <4 x float> %call +} + +define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 { +; SSE-LABEL: sqrt_v4f32_check_denorms_ninf: +; SSE: # %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 @@ -237,7 +293,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: sqrt_v4f32_check_denorms: +; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm2 @@ -251,7 +307,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: sqrt_v4f32_check_denorms: +; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2 @@ -266,7 +322,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2 + %call = tail call ninf <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2 ret <4 x float> %call } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 8c12c0d2e9fc..7db96f42b1cb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6914,6 +6914,102 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ret <16 x i16> %4 } +define <16 x i16> @shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) { +; AVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrad $25, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $25, %xmm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: 
shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpsrad $25, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpsrad $25, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrad $25, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsrad $25, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq + %1 = ashr <8 x i32> %a0, + %2 = ashr <8 x i32> %a1, + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x i32> %2 to <16 x i16> + %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> + ret <16 x i16> %5 +} + +define <16 x i16> @shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) { +; AVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $25, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrld $25, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpsrld $25, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpsrld $25, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrld $25, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsrld $25, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq + %1 = lshr <8 x i32> %a0, + %2 = lshr <8 x i32> %a1, + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x i32> %2 to <16 x i16> + %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> + ret <16 x i16> %5 +} + define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 
0e79116884f4..2601c7d4172d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -216,6 +216,58 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a ret <32 x i16> %shuffle } +define <32 x i16> @shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; KNL-LABEL: shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; KNL: ## %bb.0: +; KNL-NEXT: vpsrad $25, %zmm0, %zmm0 +; KNL-NEXT: vpsrad $25, %zmm1, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; SKX: ## %bb.0: +; SKX-NEXT: vpsrad $25, %zmm0, %zmm0 +; SKX-NEXT: vpsrad $25, %zmm1, %zmm1 +; SKX-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; SKX-NEXT: retq + %1 = ashr <16 x i32> %a0, + %2 = ashr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> + ret <32 x i16> %5 +} + +define <32 x i16> @shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; KNL-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; KNL: ## %bb.0: +; KNL-NEXT: vpsrld $25, %zmm0, %zmm0 +; KNL-NEXT: vpsrld $25, %zmm1, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; SKX: ## %bb.0: +; SKX-NEXT: vpsrld $25, %zmm0, %zmm0 +; SKX-NEXT: vpsrld $25, %zmm1, %zmm1 +; SKX-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; SKX-NEXT: retq + %1 = lshr <16 x i32> %a0, + %2 = lshr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> + ret <32 x i16> %5 +} + define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_i32: ; KNL: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 3afb54a9d3bb..3c95f4ce400e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -546,6 +546,94 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ret <64 x i8> %shuffle } +define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; AVX512F-LABEL: 
shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512VBMI-NEXT: retq + %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %3 = bitcast <16 x i32> %1 to <64 x i8> + %4 = bitcast <16 x i32> %2 to <64 x i8> + %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125> + ret <64 x i8> %5 +} + +define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld
$25, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512VBMI-NEXT: retq + %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %3 = bitcast <16 x i32> %1 to <64 x i8> + %4 = bitcast <16 x i32> %2 to <64 x i8> + %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125> + ret <64 x i8> %5 +} + define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 08923cab6ebb..17781eb922a6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -159,6 +159,36 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) { ret <32 x i16> %1 } +define <32 x i16> @combine_vpermi2var_as_packssdw(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; CHECK-LABEL: combine_vpermi2var_as_packssdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrad $25, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $25, %zmm1, %zmm1 +; CHECK-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1) + ret <32 x i16> %5 +} + +define <32 x i16> @combine_vpermi2var_as_packusdw(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; CHECK-LABEL: combine_vpermi2var_as_packusdw: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrld $25, %zmm0, %zmm0 +; CHECK-NEXT: vpsrld $25, %zmm1, %zmm1 +; CHECK-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1) + ret <32 x i16> %5 +} + define <64 x i8>
@combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; CHECK-LABEL: combine_pshufb_as_packsswb: ; CHECK: # %bb.0: diff --git a/llvm/test/ExecutionEngine/OrcLazy/Inputs/bar-return-i32-call-foo.ll b/llvm/test/ExecutionEngine/OrcLazy/Inputs/bar-return-i32-call-foo.ll new file mode 100644 index 000000000000..5a36041640ce --- /dev/null +++ b/llvm/test/ExecutionEngine/OrcLazy/Inputs/bar-return-i32-call-foo.ll @@ -0,0 +1,8 @@ +declare i32 @foo() + +define i32 @bar() { +entry: + %0 = call i32 @foo() + ret i32 %0 +} + diff --git a/llvm/test/ExecutionEngine/OrcLazy/Inputs/basic-object-source.ll b/llvm/test/ExecutionEngine/OrcLazy/Inputs/foo-return-i32-0.ll similarity index 100% rename from llvm/test/ExecutionEngine/OrcLazy/Inputs/basic-object-source.ll rename to llvm/test/ExecutionEngine/OrcLazy/Inputs/foo-return-i32-0.ll diff --git a/llvm/test/ExecutionEngine/OrcLazy/basic-object-file-loading.ll b/llvm/test/ExecutionEngine/OrcLazy/basic-object-file-loading.ll index 0d815782b1cb..9dc74d5241bb 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/basic-object-file-loading.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/basic-object-file-loading.ll @@ -1,4 +1,4 @@ -; RUN: llc -filetype=obj -o %t %p/Inputs/basic-object-source.ll +; RUN: llc -filetype=obj -o %t %p/Inputs/foo-return-i32-0.ll ; RUN: lli -jit-kind=orc-lazy -extra-object %t %s ; ; Check that we can load an object file and call a function in it. diff --git a/llvm/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll b/llvm/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll index 00b54fbf73fd..67d392e71456 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll @@ -1,6 +1,12 @@ -; RUN: lli -jit-kind=orc-lazy -orc-lazy-debug=funcs-to-stdout %s | FileCheck %s +; Test that global constructors and destructors are run: ; -; Test that global constructors and destructors are run. +; RUN: lli -jit-kind=orc-lazy -orc-lazy-debug=funcs-to-stdout -extra-module %s \ +; RUN: %S/Inputs/noop-main.ll | FileCheck %s +; +; Test that this is true for global constructors and destructors in other +; JITDylibs. 
+; RUN: lli -jit-kind=orc-lazy -orc-lazy-debug=funcs-to-stdout \ +; RUN: -jd extra -extra-module %s -jd main %S/Inputs/noop-main.ll | FileCheck %s ; ; CHECK: Hello ; CHECK: [ {{.*}}main{{.*}} ] @@ -22,11 +28,6 @@ entry: declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) -define i32 @main(i32 %argc, i8** nocapture readnone %argv) { -entry: - ret i32 0 -} - define internal void @_GLOBAL__sub_I_hello.cpp() { entry: %puts.i.i.i = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)) diff --git a/llvm/test/ExecutionEngine/OrcLazy/static-library-support.ll b/llvm/test/ExecutionEngine/OrcLazy/static-library-support.ll index 304160c7f787..a13441187aa8 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/static-library-support.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/static-library-support.ll @@ -1,11 +1,13 @@ ; This first line will generate the .o files for the next run line -; RUN: llc -filetype=obj -o %t.o %p/Inputs/basic-object-source.ll -; RUN: llvm-ar r %t.a %t.o -; RUN: lli -jit-kind=orc-lazy -extra-archive %t.a %s +; RUN: rm -rf %t && mkdir -p %t +; RUN: llc -filetype=obj -o %t/foo.o %p/Inputs/foo-return-i32-0.ll +; RUN: llc -filetype=obj -o %t/bar.o %p/Inputs/bar-return-i32-call-foo.ll +; RUN: llvm-ar r %t/staticlib.a %t/foo.o %t/bar.o +; RUN: lli -jit-kind=orc-lazy -extra-archive %t/staticlib.a %s -declare i32 @foo() +declare i32 @bar() define i32 @main() { - %r = call i32 @foo( ) ; [#uses=1] + %r = call i32 @bar() ; [#uses=1] ret i32 %r } diff --git a/llvm/test/FileCheck/numeric-defines-diagnostics.txt b/llvm/test/FileCheck/numeric-defines-diagnostics.txt index da8c9cc8884e..ddced3721f1d 100644 --- a/llvm/test/FileCheck/numeric-defines-diagnostics.txt +++ b/llvm/test/FileCheck/numeric-defines-diagnostics.txt @@ -28,6 +28,7 @@ NUMERRCLITRAIL-NEXT:Global define #1: #VALUE+2=10 (parsed as: {{\[\[#VALUE\+2:10 NUMERRCLITRAIL-NEXT: ^ ; Invalid format for variable. +RUN: %ProtectFileCheckOutput \ RUN: not FileCheck -D#,VALUE=10 --input-file %s %s 2>&1 \ RUN: | FileCheck %s --strict-whitespace --match-full-lines --check-prefix NUMERRCLIFMT diff --git a/llvm/test/FileCheck/numeric-expression.txt b/llvm/test/FileCheck/numeric-expression.txt index 14e378281793..8396b6e2de11 100644 --- a/llvm/test/FileCheck/numeric-expression.txt +++ b/llvm/test/FileCheck/numeric-expression.txt @@ -41,8 +41,10 @@ CHECK-LABEL: DEF FMT SPC CHECK-NEXT: [[# %x , VAR2a : ]] ; Numeric variable definition with unsupported matching format. +RUN: %ProtectFileCheckOutput \ RUN: not FileCheck --check-prefixes ERR,INVALID-FMT-SPEC1 --input-file %s %s 2>&1 \ RUN: | FileCheck --check-prefix INVALID-FMT-SPEC-MSG1 --strict-whitespace %s +RUN: %ProtectFileCheckOutput \ RUN: not FileCheck --check-prefixes ERR,INVALID-FMT-SPEC2 --input-file %s %s 2>&1 \ RUN: | FileCheck --check-prefix INVALID-FMT-SPEC-MSG2 --strict-whitespace %s @@ -182,6 +184,7 @@ CHECK-NEXT: [[# %u, VAR2]] CHECK-NEXT: [[# %u, VAR3]] ; Conflicting implicit format. +RUN: %ProtectFileCheckOutput \ RUN: not FileCheck --check-prefixes CHECK,FMT-CONFLICT --input-file %s %s 2>&1 \ RUN: | FileCheck --strict-whitespace --check-prefix FMT-CONFLICT-MSG %s @@ -359,6 +362,7 @@ SAME-LINE-USE-MSG2-NEXT: {{S}}AME-LINE-USE2-NEXT: {{\[\[#VAR2:VAR1\+1\]\] \[\[#V SAME-LINE-USE-MSG2-NEXT: {{^}} ^{{$}} ; Invalid change of format in variable redefinition. 
+RUN: %ProtectFileCheckOutput \ RUN: not FileCheck --check-prefix REDEF-NEW-FMT --input-file %s %s 2>&1 \ RUN: | FileCheck --strict-whitespace --check-prefix REDEF-NEW-FMT-MSG %s diff --git a/llvm/test/MC/PowerPC/ppc64-prefix-align.s b/llvm/test/MC/PowerPC/ppc64-prefix-align.s index 80d2f0722a76..29594e9e33ae 100644 --- a/llvm/test/MC/PowerPC/ppc64-prefix-align.s +++ b/llvm/test/MC/PowerPC/ppc64-prefix-align.s @@ -13,10 +13,10 @@ beq 0, LAB1 # 4 beq 1, LAB2 # 8 -# CHECK-BE: 0: 41 82 00 c0 bt 2, .+192 -# CHECK-BE-NEXT: 4: 41 86 00 f8 bt 6, .+248 -# CHECK-LE: 0: c0 00 82 41 bt 2, .+192 -# CHECK-LE-NEXT: 4: f8 00 86 41 bt 6, .+248 +# CHECK-BE: 0: 41 82 00 c0 bt 2, 0xc0 +# CHECK-BE-NEXT: 4: 41 86 00 f8 bt 6, 0xfc +# CHECK-LE: 0: c0 00 82 41 bt 2, 0xc0 +# CHECK-LE-NEXT: 4: f8 00 86 41 bt 6, 0xfc paddi 1, 2, 8589934576, 0 # 16 paddi 1, 2, 8589934576, 0 # 24 paddi 1, 2, 8589934576, 0 # 32 diff --git a/llvm/test/MC/RISCV/attribute-with-option.s b/llvm/test/MC/RISCV/attribute-with-option.s new file mode 100644 index 000000000000..749717cb7a55 --- /dev/null +++ b/llvm/test/MC/RISCV/attribute-with-option.s @@ -0,0 +1,21 @@ +## When a user specifies an architecture extension which conflicts with an +## architecture attribute, we use the architecture attribute instead of the +## command line option. +## +## This test uses option '-mattr=+e' to specify the "e" extension. However, +## there is an architecture attribute in the file to specify rv32i. We will +## use rv32i to assemble the file instead of rv32e. + +# RUN: llvm-mc %s -triple=riscv32 -mattr=+e -filetype=obj -o - \ +# RUN: | llvm-readobj -A - | FileCheck %s + +.attribute arch, "rv32i2p0" +## This operand would be invalid for RV32E, because x16 is not a valid RV32E +## register. Since the file is assembled as RV32I, no assembly error is triggered. +lui x16, 1 + +## Check that the architecture attribute is not overridden by the command line +## option. +# CHECK: Tag: 5 +# CHECK-NEXT: TagName: arch +# CHECK-NEXT: Value: rv32i2p0 diff --git a/llvm/test/MC/X86/align-via-padding-corner.s b/llvm/test/MC/X86/align-via-padding-corner.s new file mode 100644 index 000000000000..cc13ff7eed7f --- /dev/null +++ b/llvm/test/MC/X86/align-via-padding-corner.s @@ -0,0 +1,29 @@ + # RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s -x86-pad-max-prefix-size=5 | llvm-objdump -d - | FileCheck %s + + + # The first test checks the correctness corner case - we can't add padding + # prefixes to an instruction that follows an explicit prefix. + .globl labeled_prefix_test +labeled_prefix_test: +# CHECK: 0: 2e 2e 2e 2e 2e e9 06 00 00 00 jmp +# CHECK: a: 3e e9 00 00 00 00 jmp + jmp bar + DS + jmp bar + .p2align 4 +bar: + ret + + # The second test is similar to the first - we can't add padding prefixes + # to an instruction that follows hard-coded bytes.
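 # (Here the hard-coded 0x3e byte emitted via .byte plays the role of the + # prefix, so no padding prefixes may be attached to the jmp that follows it.)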
+ .p2align 5 + .globl labeled_hardcode_test +labeled_hardcode_test: +# CHECK: 20: 2e 2e 2e 2e 2e e9 06 00 00 00 jmp +# CHECK: 2a: 3e e9 00 00 00 00 jmp + jmp baz + .byte 0x3e + jmp baz + .p2align 4 +baz: + ret diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll index 8dc9b945d589..369a661a89cf 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -76,6 +76,26 @@ define void @test3a(i8* %p) { ret void } +declare noalias i8* @aligned_alloc(i64, i64) + +define void @test3b(i8* %p) { + %1 = tail call noalias i8* @aligned_alloc(i64 32, i64 128) + ; CHECK: %1 = alloca i8, i64 128, align 32 + ; CHECK-NEXT: tail call void @nofree_arg_only + tail call void @nofree_arg_only(i8* %1, i8* %p) + ; CHECK-NOT: @free(i8* %1) + tail call void @free(i8* %1) + ret void +} + +; leave alone non-constant alignments. +define void @test3c(i64 %alignment) { + %1 = tail call noalias i8* @aligned_alloc(i64 %alignment, i64 128) + ; CHECK: tail call noalias i8* @aligned_alloc + tail call void @free(i8* %1) + ret void +} + declare noalias i8* @calloc(i64, i64) define void @test0() { @@ -90,7 +110,7 @@ define void @test0() { ret void } -; TEST 4 +; TEST 4 define void @test4() { %1 = tail call noalias i8* @malloc(i64 4) ; CHECK: %1 = alloca i8, i64 4 @@ -219,7 +239,7 @@ define i32 @test_lifetime() { ret i32 %3 } -; TEST 11 +; TEST 11 define void @test11() { %1 = tail call noalias i8* @malloc(i64 4) diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll index 1e93e97634b2..ef8cd5234326 100644 --- a/llvm/test/Transforms/Attributor/range.ll +++ b/llvm/test/Transforms/Attributor/range.ll @@ -1276,13 +1276,11 @@ define i32 @ret1or2(i1 %c) { define i1 @callee_range_1(i1 %c1, i1 %c2, i1 %c3) { ; OLD_PM-LABEL: define {{[^@]+}}@callee_range_1 ; OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) -; OLD_PM-NEXT: [[F:%.*]] = and i1 true, true -; OLD_PM-NEXT: ret i1 [[F]] +; OLD_PM-NEXT: ret i1 true ; ; NEW_PM-LABEL: define {{[^@]+}}@callee_range_1 ; NEW_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) -; NEW_PM-NEXT: [[F:%.*]] = and i1 true, true -; NEW_PM-NEXT: ret i1 [[F]] +; NEW_PM-NEXT: ret i1 true ; ; CGSCC_OLD_PM-LABEL: define {{[^@]+}}@callee_range_1 ; CGSCC_OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll index 6315e3bd74da..634c37568ea3 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll @@ -457,7 +457,7 @@ define i1 @test14_slt(i32 %a) { ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[RESULT:%.*]] = or i1 false, false -; CHECK-NEXT: ret i1 [[RESULT]] +; CHECK-NEXT: ret i1 false ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -508,7 +508,7 @@ define i1 @test14_sgt(i32 %a) { ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[RESULT:%.*]] = or i1 false, false -; CHECK-NEXT: ret i1 [[RESULT]] +; CHECK-NEXT: ret i1 false ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -585,7 +585,7 @@ define i1 @test14_ugt(i32 %a) { ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[RESULT:%.*]] = or i1 false, false -; CHECK-NEXT: ret i1 [[RESULT]] +; CHECK-NEXT: ret i1 false ; CHECK: else: ; CHECK-NEXT: ret 
i1 false ; @@ -629,6 +629,31 @@ else: ret i1 false } +define i1 @test14_ugt_and(i32 %a) { +; CHECK-LABEL: @test14_ugt_and( +; CHECK-NEXT: [[A_OFF:%.*]] = add i32 [[A:%.*]], -8 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[A_OFF]], 8 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[RESULT:%.*]] = and i1 false, false +; CHECK-NEXT: ret i1 false +; CHECK: else: +; CHECK-NEXT: ret i1 false +; + %a.off = add i32 %a, -8 + %cmp = icmp ugt i32 %a.off, 8 + br i1 %cmp, label %then, label %else + +then: + %dead.1 = icmp eq i32 %a, 8 + %dead.2 = icmp eq i32 %a, 16 + %result = and i1 %dead.1, %dead.2 + ret i1 %result + +else: + ret i1 false +} + @limit = external global i32 define i1 @test15(i32 %a) { ; CHECK-LABEL: @test15( diff --git a/llvm/test/Transforms/GVN/malloc-load-removal.ll b/llvm/test/Transforms/GVN/malloc-load-removal.ll index 1d7a2ddc4c2d..84f4746344ca 100644 --- a/llvm/test/Transforms/GVN/malloc-load-removal.ll +++ b/llvm/test/Transforms/GVN/malloc-load-removal.ll @@ -54,3 +54,28 @@ if.end: ; preds = %if.then, %entry ; CHECK_NO_LIBCALLS: load ; CHECK_NO_LIBCALLS: icmp } + +declare i8* @aligned_alloc(i64, i64) nounwind + +define noalias i8* @test3() nounwind uwtable ssp { +entry: + %call = tail call i8* @aligned_alloc(i64 256, i64 32) nounwind + %0 = load i8, i8* %call, align 32 + %tobool = icmp eq i8 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i8 0, i8* %call, align 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i8* %call + +; CHECK-LABEL: @test3( +; CHECK-NOT: load +; CHECK-NOT: icmp + +; CHECK_NO_LIBCALLS-LABEL: @test3( +; CHECK_NO_LIBCALLS: load +; CHECK_NO_LIBCALLS: icmp +} diff --git a/llvm/test/Transforms/Inline/ret_attr_update.ll b/llvm/test/Transforms/Inline/ret_attr_update.ll deleted file mode 100644 index 2e53540c3fe2..000000000000 --- a/llvm/test/Transforms/Inline/ret_attr_update.ll +++ /dev/null @@ -1,159 +0,0 @@ -; RUN: opt < %s -inline-threshold=0 -always-inline -S | FileCheck %s -; RUN: opt < %s -passes=always-inline -S | FileCheck %s - -declare i8* @foo(i8*) argmemonly nounwind - -define i8* @callee(i8 *%p) alwaysinline { -; CHECK: @callee( -; CHECK: call i8* @foo(i8* noalias %p) - %r = call i8* @foo(i8* noalias %p) - ret i8* %r -} - -define i8* @caller(i8* %ptr, i64 %x) { -; CHECK-LABEL: @caller -; CHECK: call nonnull i8* @foo(i8* noalias - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %p = call nonnull i8* @callee(i8* %gep) - ret i8* %p -} - -declare void @llvm.experimental.guard(i1,...) -; Cannot add nonnull attribute to foo -; because the guard is a throwing call -define internal i8* @callee_with_throwable(i8* %p) alwaysinline { -; CHECK-NOT: callee_with_throwable - %r = call i8* @foo(i8* %p) - %cond = icmp ne i8* %r, null - call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] - ret i8* %r -} - -declare i8* @bar(i8*) readonly nounwind -; Here also we cannot add nonnull attribute to the call bar. 
-define internal i8* @callee_with_explicit_control_flow(i8* %p) alwaysinline { -; CHECK-NOT: callee_with_explicit_control_flow - %r = call i8* @bar(i8* %p) - %cond = icmp ne i8* %r, null - br i1 %cond, label %ret, label %orig - -ret: - ret i8* %r - -orig: - ret i8* %p -} - -define i8* @caller2(i8* %ptr, i64 %x, i1 %cond) { -; CHECK-LABEL: @caller2 -; CHECK: call i8* @foo -; CHECK: call i8* @bar - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %p = call nonnull i8* @callee_with_throwable(i8* %gep) - %q = call nonnull i8* @callee_with_explicit_control_flow(i8* %gep) - br i1 %cond, label %pret, label %qret - -pret: - ret i8* %p - -qret: - ret i8* %q -} - -define internal i8* @callee3(i8 *%p) alwaysinline { -; CHECK-NOT: callee3 - %r = call noalias i8* @foo(i8* %p) - ret i8* %r -} - -; add the deref attribute to the existing attributes on foo. -define i8* @caller3(i8* %ptr, i64 %x) { -; CHECK-LABEL: caller3 -; CHECK: call noalias dereferenceable_or_null(12) i8* @foo - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %p = call dereferenceable_or_null(12) i8* @callee3(i8* %gep) - ret i8* %p -} - -declare i8* @inf_loop_call(i8*) nounwind -; We cannot propagate attributes to foo because we do not know whether inf_loop_call -; will return execution. -define internal i8* @callee_with_sideeffect_callsite(i8* %p) alwaysinline { -; CHECK-NOT: callee_with_sideeffect_callsite - %r = call i8* @foo(i8* %p) - %v = call i8* @inf_loop_call(i8* %p) - ret i8* %r -} - -; do not add deref attribute to foo -define i8* @test4(i8* %ptr, i64 %x) { -; CHECK-LABEL: test4 -; CHECK: call i8* @foo - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %p = call dereferenceable_or_null(12) i8* @callee_with_sideeffect_callsite(i8* %gep) - ret i8* %p -} - -declare i8* @baz(i8*) nounwind readonly -define internal i8* @callee5(i8* %p) alwaysinline { -; CHECK-NOT: callee5 - %r = call i8* @foo(i8* %p) - %v = call i8* @baz(i8* %p) - ret i8* %r -} - -; add the deref attribute to foo. -define i8* @test5(i8* %ptr, i64 %x) { -; CHECK-LABEL: test5 -; CHECK: call dereferenceable_or_null(12) i8* @foo - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %s = call dereferenceable_or_null(12) i8* @callee5(i8* %gep) - ret i8* %s -} - -; deref attributes have different values on the callee and the call feeding into -; the return. -; AttrBuilder chooses the already existing value and does not overwrite it. -define internal i8* @callee6(i8* %p) alwaysinline { -; CHECK-NOT: callee6 - %r = call dereferenceable_or_null(16) i8* @foo(i8* %p) - %v = call i8* @baz(i8* %p) - ret i8* %r -} - - -define i8* @test6(i8* %ptr, i64 %x) { -; CHECK-LABEL: test6 -; CHECK: call dereferenceable_or_null(16) i8* @foo - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %s = call dereferenceable_or_null(12) i8* @callee6(i8* %gep) - ret i8* %s -} - -; We add the attributes from the callee to both the calls below. 
-define internal i8* @callee7(i8 *%ptr, i1 %cond) alwaysinline { -; CHECK-NOT: @callee7( - br i1 %cond, label %pass, label %fail - -pass: - %r = call i8* @foo(i8* noalias %ptr) - ret i8* %r - -fail: - %s = call i8* @baz(i8* %ptr) - ret i8* %s -} - -define void @test7(i8* %ptr, i64 %x, i1 %cond) { -; CHECK-LABEL: @test7 -; CHECK: call nonnull i8* @foo(i8* noalias -; CHECK: call nonnull i8* @baz -; CHECK: phi i8* -; CHECK: call void @snort - - %gep = getelementptr inbounds i8, i8* %ptr, i64 %x - %t = call nonnull i8* @callee7(i8* %gep, i1 %cond) - call void @snort(i8* %t) - ret void -} -declare void @snort(i8*) diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll index 01084312c9f8..897242e4837b 100644 --- a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll +++ b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll @@ -159,8 +159,8 @@ define i8 @n8(i8 %x, i1 %y, i8 %z) { ; x - (y - z) -> x - y + z -> x + (z - y) define i8 @t9(i8 %x, i8 %y) { ; CHECK-LABEL: @t9( -; CHECK-NEXT: [[T01:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: ret i8 [[T01]] +; CHECK-NEXT: [[T0_NEG:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i8 [[T0_NEG]] ; %t0 = sub i8 %y, %x %t1 = sub i8 0, %t0 @@ -375,3 +375,51 @@ define i8 @n21(i8 %x, i16 %y) { %t2 = sub i8 %x, %t1 ret i8 %t2 } + +define i4 @negate_xor(i4 %x) { +; CHECK-LABEL: @negate_xor( +; CHECK-NEXT: [[TMP1:%.*]] = xor i4 [[X:%.*]], -6 +; CHECK-NEXT: [[O_NEG:%.*]] = add i4 [[TMP1]], 1 +; CHECK-NEXT: ret i4 [[O_NEG]] +; + %o = xor i4 %x, 5 + %r = sub i4 0, %o + ret i4 %r +} + +define <2 x i4> @negate_xor_vec(<2 x i4> %x) { +; CHECK-LABEL: @negate_xor_vec( +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i4> [[X:%.*]], +; CHECK-NEXT: [[O_NEG:%.*]] = add <2 x i4> [[TMP1]], +; CHECK-NEXT: ret <2 x i4> [[O_NEG]] +; + %o = xor <2 x i4> %x, + %r = sub <2 x i4> zeroinitializer, %o + ret <2 x i4> %r +} + +define i8 @negate_xor_use(i8 %x) { +; CHECK-LABEL: @negate_xor_use( +; CHECK-NEXT: [[O:%.*]] = xor i8 [[X:%.*]], 5 +; CHECK-NEXT: call void @use8(i8 [[O]]) +; CHECK-NEXT: [[R:%.*]] = sub i8 0, [[O]] +; CHECK-NEXT: ret i8 [[R]] +; + %o = xor i8 %x, 5 + call void @use8(i8 %o) + %r = sub i8 0, %o + ret i8 %r +} + +define i4 @negate_shl_xor(i4 %x, i4 %y) { +; CHECK-LABEL: @negate_shl_xor( +; CHECK-NEXT: [[TMP1:%.*]] = xor i4 [[X:%.*]], -6 +; CHECK-NEXT: [[O_NEG:%.*]] = add i4 [[TMP1]], 1 +; CHECK-NEXT: [[S_NEG:%.*]] = shl i4 [[O_NEG]], [[Y:%.*]] +; CHECK-NEXT: ret i4 [[S_NEG]] +; + %o = xor i4 %x, 5 + %s = shl i4 %o, %y + %r = sub i4 0, %s + ret i4 %r +} diff --git a/llvm/test/Transforms/JumpThreading/select-unfold-msan.ll b/llvm/test/Transforms/JumpThreading/select-unfold-msan.ll new file mode 100644 index 000000000000..ea336e0f0f7e --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/select-unfold-msan.ll @@ -0,0 +1,28 @@ +; PR45220 +; RUN: opt -S -jump-threading < %s | FileCheck %s + +declare i1 @NOP() + +define dso_local i32 @f(i1 %b, i1 %u) sanitize_memory { +entry: + br i1 %b, label %if.end, label %if.else + +if.else: + %call = call i1 @NOP() + br label %if.end + +if.end: +; Check that both selects in this BB are still in place, +; and were not replaced with a conditional branch. 
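+; (Unfolding a select into a branch would make control flow depend on the +; select condition, which MSan could then report as a branch on uninitialized +; memory - presumably why the unfolding is skipped under sanitize_memory.)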
+; CHECK: phi +; CHECK-NEXT: phi +; CHECK-NEXT: select +; CHECK-NEXT: select +; CHECK-NEXT: ret + %u1 = phi i1 [ true, %if.else ], [ %u, %entry ] + %v = phi i1 [ %call, %if.else ], [ false, %entry ] + %s = select i1 %u1, i32 22, i32 0 + %v1 = select i1 %v, i32 %s, i32 42 + ret i32 %v1 +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll index 8302ad9562f5..250a51a27b44 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -111,7 +111,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace( } ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32 -; CHECK store <2 x float> , <2 x float> addrspace(1)* %{{[0-9]+$}} +; CHECK: store <2 x i32> , <2 x i32> addrspace(1)* define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll index 1cb8d14f1778..3f24f5c9e53d 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll @@ -55,7 +55,7 @@ entry: } ; CHECK-LABEL: @ext_ptr -; CHECK load <2 x i32> +; CHECK: load <2 x i32> define void @ext_ptr(i32 addrspace(5)* %p) { entry: %gep1 = getelementptr inbounds i32, i32 addrspace(5)* %p, i64 0 @@ -68,7 +68,7 @@ entry: } ; CHECK-LABEL: @shrink_ptr -; CHECK load <2 x i32> +; CHECK: load <2 x i32> define void @shrink_ptr(i32* %p) { entry: %gep1 = getelementptr inbounds i32, i32* %p, i64 0 diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll new file mode 100644 index 000000000000..c417ba719412 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/const-gep.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s + +; Make sure we correctly lower in the presence of getelementptr constant +; expressions. 
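+; (Note that the pointer operands of the loads and stores below are bitcast +; and getelementptr constant expressions on @foo, not instructions.)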
+ +@foo = global [5 x <4 x double>] zeroinitializer, align 16 + +define void @test(i32 %r, i32 %c) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[R:%.*]], i32* [[R_ADDR]], align 4 +; CHECK-NEXT: store i32 [[C:%.*]], i32* [[C_ADDR]], align 4 +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* bitcast ([5 x <4 x double>]* @foo to <2 x double>*), align 8 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i32 0, i32 0, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK2:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT4:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT3]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <1 x double> [[BLOCK2]], [[SPLAT_SPLAT4]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <1 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP4]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> undef, double [[TMP7]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x double> [[BLOCK5]], [[SPLAT_SPLAT7]] +; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[COL_LOAD]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <1 x double> undef, double [[TMP9]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT9]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x double> [[BLOCK8]], [[SPLAT_SPLAT10]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> undef, double [[TMP14]], i32 0 +; CHECK-NEXT: 
[[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x double> [[BLOCK11]], [[SPLAT_SPLAT13]] +; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD1]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> undef, double [[TMP16]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x double> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x double> [[TMP18]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> undef, <2 x double> [[TMP19]], <2 x i32> +; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[COL_LOAD1]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]] +; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> undef, <1 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[COL_LOAD1]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 +; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> undef, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x double> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> [[TMP26]], <2 x i32> +; CHECK-NEXT: store <2 x double> [[COL_LOAD]], <2 x double>* bitcast (double* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 0) to <2 x double>*), align 8 +; CHECK-NEXT: store <2 x double> [[COL_LOAD1]], <2 x double>* bitcast (double* getelementptr ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2, i32 2) to <2 x double>*), align 8 +; CHECK-NEXT: ret void +; +entry: + %r.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %r, i32* %r.addr, align 4 + store i32 %c, i32* %c.addr, align 4 + %0 = load <4 x double>, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 0), align 16 + %mul = call <4 x double> @llvm.matrix.multiply(<4 x double> %0, <4 x double> %0, i32 2, i32 2, i32 2) + store <4 x double> %0, <4 x double>* getelementptr inbounds ([5 x <4 x double>], [5 x <4 x double>]* @foo, i64 0, i64 2), align 16 + ret void +} + +declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32) diff --git a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll index 72f4839a5545..c62bac950df2 100644 --- a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll +++ 
b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll @@ -54,3 +54,28 @@ if.end: ; preds = %if.then, %entry ; CHECK_NO_LIBCALLS: load ; CHECK_NO_LIBCALLS: icmp } + +declare i8* @aligned_alloc(i64, i64) nounwind + +define noalias i8* @test3() nounwind uwtable ssp { +entry: + %call = tail call i8* @aligned_alloc(i64 256, i64 32) nounwind + %0 = load i8, i8* %call, align 32 + %tobool = icmp eq i8 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i8 0, i8* %call, align 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i8* %call + +; CHECK-LABEL: @test3( +; CHECK-NOT: load +; CHECK-NOT: icmp + +; CHECK_NO_LIBCALLS-LABEL: @test3( +; CHECK_NO_LIBCALLS: load +; CHECK_NO_LIBCALLS: icmp +} diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll new file mode 100644 index 000000000000..bda54e25b945 --- /dev/null +++ b/llvm/test/Transforms/SROA/scalable-vectors.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -passes=sroa -S | FileCheck %s + +; This test checks that SROA runs mem2reg on scalable vectors. + +define <vscale x 16 x i1> @alloca_nxv16i1(<vscale x 16 x i1> %pg) { +; CHECK-LABEL: alloca_nxv16i1 +; CHECK-NEXT: ret <vscale x 16 x i1> %pg + %pg.addr = alloca <vscale x 16 x i1> + store <vscale x 16 x i1> %pg, <vscale x 16 x i1>* %pg.addr + %1 = load <vscale x 16 x i1>, <vscale x 16 x i1>* %pg.addr + ret <vscale x 16 x i1> %1 +} + +define <vscale x 16 x i8> @alloca_nxv16i8(<vscale x 16 x i8> %vec) { +; CHECK-LABEL: alloca_nxv16i8 +; CHECK-NEXT: ret <vscale x 16 x i8> %vec + %vec.addr = alloca <vscale x 16 x i8> + store <vscale x 16 x i8> %vec, <vscale x 16 x i8>* %vec.addr + %1 = load <vscale x 16 x i8>, <vscale x 16 x i8>* %vec.addr + ret <vscale x 16 x i8> %1 +} + +; Test scalable alloca that can't be promoted. Mem2Reg only considers +; non-volatile loads and stores for promotion. +define <vscale x 16 x i8> @unpromotable_alloca(<vscale x 16 x i8> %vec) { +; CHECK-LABEL: unpromotable_alloca +; CHECK-NEXT: %vec.addr = alloca <vscale x 16 x i8> +; CHECK-NEXT: store volatile <vscale x 16 x i8> %vec, <vscale x 16 x i8>* %vec.addr +; CHECK-NEXT: %1 = load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %vec.addr +; CHECK-NEXT: ret <vscale x 16 x i8> %1 + %vec.addr = alloca <vscale x 16 x i8> + store volatile <vscale x 16 x i8> %vec, <vscale x 16 x i8>* %vec.addr + %1 = load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %vec.addr + ret <vscale x 16 x i8> %1 +} diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/pr45371-find-either-reset.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/pr45371-find-either-reset.ll new file mode 100644 index 000000000000..efe426b718eb --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/pr45371-find-either-reset.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -separate-const-offset-from-gep < %s | FileCheck %s + +@e = external global [4000 x i8], align 1 + +define void @find_either_reset() { +; CHECK-LABEL: @find_either_reset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 65536, undef +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[TMP0]], 96 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i8 0 to i64 +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i8 [[TMP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4000 x i8], [4000 x i8]* @e, i64 [[IDXPROM]], i64 [[IDXPROM1]] +; CHECK-NEXT: ret void +; +entry: + %sub = sub nsw i32 65536, undef + %0 = trunc i32 %sub to i8 + %1 = add i8 %0, -4000 + %arrayidx = getelementptr inbounds [4000 x i8], [4000 x i8]* @e, i8 0, i8 %1 + ret void +} diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll new file mode 100644 index 000000000000..d2f23ab7894e --- /dev/null +++ b/llvm/test/Verifier/matrix-intrinsics.ll @@ -0,0 +1,40 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32) +define <4 x float> @transpose(<4 x
float> %m) { +; CHECK: assembly parsed, but does not verify as correct! +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector + %result.1 = call <4 x float> @llvm.matrix.transpose.v4f32(<4 x float> %m, i32 3, i32 2) + %result.2 = call <4 x float> @llvm.matrix.transpose.v4f32(<4 x float> %result.1, i32 2, i32 1) + ret <4 x float> %result.2 +} + +declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32) +define <4 x float> @multiply(<4 x float> %m) { +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector + %result.1 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %m, <4 x float> %m, i32 3, i32 2, i32 2) + %result.2 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %result.1, <4 x float> %m, i32 2, i32 2, i32 1) + ret <4 x float> %result.2 +} + +declare <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>*, i32, i32, i32) +declare <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>*, i32, i32, i32) +define <4 x float> @columnwise_load(<4 x float>* %m, <6 x float>* %n) { +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector + %result.1 = call <4 x float> @llvm.matrix.columnwise.load.v4f32.p0v4f32(<4 x float>* %m, i32 2, i32 1, i32 2) + %result.2 = call <6 x float> @llvm.matrix.columnwise.load.v6f32.p0v6f32(<6 x float>* %n, i32 2, i32 3, i32 3) + ret <4 x float> %result.1 +} + +declare void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, i32, i32) +declare void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, i32, i32, i32) +define void @columnwise_store(<4 x float>* %m, <6 x float>* %n) { +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector +; CHECK-NEXT: result of a matrix operation does not fit in the returned vector + call void @llvm.matrix.columnwise.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %m, i32 2, i32 1, i32 2) + call void @llvm.matrix.columnwise.store.v6f32.p0v6f32(<6 x float> zeroinitializer, <6 x float>* %n, i32 2, i32 3, i32 3) + ret void +} diff --git a/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-size.ifs b/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-size.ifs index 9afb08802726..30b7cda9b548 100644 --- a/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-size.ifs +++ b/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-size.ifs @@ -1,8 +1,8 @@ # NOTE: Used by weak-mismatch.ifs ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - foobar: { Type: Object, Size: 2 } + - { Name: foobar, Type: Object, Size: 2 } ... diff --git a/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-type.ifs b/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-type.ifs index 8fc550a644cb..3f8d54c7e536 100644 --- a/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-type.ifs +++ b/llvm/test/tools/llvm-ifs/Inputs/strong-mismatch-type.ifs @@ -1,8 +1,8 @@ # NOTE: Used by weak-mismatch.ifs ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - foobar: { Type: Func } + - { Name: foobar, Type: Func } ... 
diff --git a/llvm/test/tools/llvm-ifs/conflict-header-format.ifs b/llvm/test/tools/llvm-ifs/conflict-header-format.ifs index 40ae9c0526f2..4e26fb8080e8 100644 --- a/llvm/test/tools/llvm-ifs/conflict-header-format.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-header-format.ifs @@ -5,10 +5,10 @@ # CHECK-IFS-NEXT: Filenames: # CHECK-IFS-NEXT: ObjectFileFormat Values: TBD ELF ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-apple-unknown ObjectFileFormat: TBD Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/conflict-header-triple.ifs b/llvm/test/tools/llvm-ifs/conflict-header-triple.ifs index 15bddc6a15f7..9ce04b8b3f31 100644 --- a/llvm/test/tools/llvm-ifs/conflict-header-triple.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-header-triple.ifs @@ -5,10 +5,10 @@ # CHECK-IFS-NEXT: Filenames: # CHECK-IFS-NEXT: Triple Values: mips-unknown-linux x86_64-unknown-linux-gnu ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: mips-unknown-linux ObjectFileFormat: ELF Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/conflict-header-version.ifs b/llvm/test/tools/llvm-ifs/conflict-header-version.ifs index addf9943441b..ecdeb311f860 100644 --- a/llvm/test/tools/llvm-ifs/conflict-header-version.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-header-version.ifs @@ -5,12 +5,12 @@ # RUN: FileCheck %s --check-prefixes=CHECK-IFS2 # CHECK-IFS: error: Interface Stub: IfsVersion Mismatch. -# CHECK-IFS2: error: Interface Stub: Bad IfsVersion: 0.0, llvm-ifs supported version: 1.2. +# CHECK-IFS2: error: Interface Stub: Bad IfsVersion: 0.0, llvm-ifs supported version: 2.0. ---- !experimental-ifs-v1 +--- !experimental-ifs-v2 IfsVersion: 0.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/conflict-size.ifs b/llvm/test/tools/llvm-ifs/conflict-size.ifs index 173ce268c741..5e0fcafd55db 100644 --- a/llvm/test/tools/llvm-ifs/conflict-size.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-size.ifs @@ -7,10 +7,10 @@ # CHECK-IFS-NEXT: Filename: # CHECK-IFS-NEXT: Size Values: 1 4 ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - b: { Type: Object, Size: 1 } + - { Name: b, Type: Object, Size: 1 } ... diff --git a/llvm/test/tools/llvm-ifs/conflict-type.ifs b/llvm/test/tools/llvm-ifs/conflict-type.ifs index c518be4e1411..1a10ea79a41c 100644 --- a/llvm/test/tools/llvm-ifs/conflict-type.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-type.ifs @@ -7,10 +7,10 @@ # CHECK-IFS-NEXT: Filename: # CHECK-IFS-NEXT: Type Values: Object Func ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Object, Size: 1 } + - { Name: a, Type: Object, Size: 1 } ... 
diff --git a/llvm/test/tools/llvm-ifs/conflict-weak.ifs b/llvm/test/tools/llvm-ifs/conflict-weak.ifs index 823b8f1866c3..23eb73d7535f 100644 --- a/llvm/test/tools/llvm-ifs/conflict-weak.ifs +++ b/llvm/test/tools/llvm-ifs/conflict-weak.ifs @@ -2,12 +2,12 @@ # RUN: FileCheck %s --check-prefixes=CHECK-IFS # CHECK-IFS: Symbols: -# CHECK-IFS-NEXT: a: { Type: Func, Weak: true } +# CHECK-IFS-NEXT: - { Name: a, Type: Func, Weak: true } ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Func, Weak: true } + - { Name: a, Type: Func, Weak: true } ... diff --git a/llvm/test/tools/llvm-ifs/default-empty.ifs b/llvm/test/tools/llvm-ifs/default-empty.ifs index 9848f418f58c..c61f29a37976 100644 --- a/llvm/test/tools/llvm-ifs/default-empty.ifs +++ b/llvm/test/tools/llvm-ifs/default-empty.ifs @@ -1,25 +1,25 @@ # RUN: llvm-ifs -action write-ifs -o - %s | FileCheck --check-prefixes=CHECK-DEFAULT %s # RUN: llvm-ifs -action write-ifs -o - %s %S/weak.ifs | FileCheck --check-prefixes=CHECK-MERGE %s -# CHECK-DEFAULT: --- !experimental-ifs-v1 -# CHECK-DEFAULT-NEXT: IfsVersion: 1.2 +# CHECK-DEFAULT: --- !experimental-ifs-v2 +# CHECK-DEFAULT-NEXT: IfsVersion: 2.0 # CHECK-DEFAULT-NEXT: Triple: '' # CHECK-DEFAULT-NEXT: ObjectFileFormat: ELF -# CHECK-DEFAULT-NEXT: Symbols: {} +# CHECK-DEFAULT-NEXT: Symbols: [] # CHECK-DEFAULT-NEXT: ... -# CHECK-MERGE: --- !experimental-ifs-v1 -# CHECK-MERGE-NEXT: IfsVersion: 1.0 +# CHECK-MERGE: --- !experimental-ifs-v2 +# CHECK-MERGE-NEXT: IfsVersion: 2.0 # CHECK-MERGE-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-MERGE-NEXT: ObjectFileFormat: ELF # CHECK-MERGE-NEXT: Symbols: -# CHECK-MERGE-DAG: _Z8weakFuncv: { Type: Func, Weak: true } -# CHECK-MERGE-DAG: _Z10strongFuncv: { Type: Func } +# CHECK-MERGE-DAG: - { Name: _Z8weakFuncv, Type: Func, Weak: true } +# CHECK-MERGE-DAG: - { Name: _Z10strongFuncv, Type: Func } # CHECK-MERGE: ... ---- !experimental-ifs-v1 -IfsVersion: 1.2 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: '' ObjectFileFormat: ELF -Symbols: {} +Symbols: [] ... diff --git a/llvm/test/tools/llvm-ifs/empty1.ifs b/llvm/test/tools/llvm-ifs/empty1.ifs new file mode 100644 index 000000000000..d237dd7ea10a --- /dev/null +++ b/llvm/test/tools/llvm-ifs/empty1.ifs @@ -0,0 +1,15 @@ +# RUN: llvm-ifs -action write-ifs -o - %s | FileCheck %s + +# CHECK: --- !experimental-ifs-v2 +# CHECK-NEXT: IfsVersion: 2.0 +# CHECK-NEXT: Triple: x86_64-unknown-linux-gnu +# CHECK-NEXT: ObjectFileFormat: ELF +# CHECK-NEXT: Symbols: [] +# CHECK: ... + +--- !experimental-ifs-v2 +IfsVersion: 2.0 +Triple: x86_64-unknown-linux-gnu +ObjectFileFormat: ELF +Symbols: [] +... diff --git a/llvm/test/tools/llvm-ifs/empty2.ifs b/llvm/test/tools/llvm-ifs/empty2.ifs new file mode 100644 index 000000000000..a294c777bbf9 --- /dev/null +++ b/llvm/test/tools/llvm-ifs/empty2.ifs @@ -0,0 +1,15 @@ +# RUN: llvm-ifs -action write-ifs -o - %s | FileCheck %s + +# CHECK: --- !experimental-ifs-v2 +# CHECK-NEXT: IfsVersion: 2.0 +# CHECK-NEXT: Triple: x86_64-unknown-linux-gnu +# CHECK-NEXT: ObjectFileFormat: ELF +# CHECK-NEXT: Symbols: [] +# CHECK: ... + +--- !experimental-ifs-v2 +IfsVersion: 2.0 +Triple: x86_64-unknown-linux-gnu +ObjectFileFormat: ELF +Symbols: +... 
diff --git a/llvm/test/tools/llvm-ifs/func.ifs b/llvm/test/tools/llvm-ifs/func.ifs index 496e26241922..d6d85782b2e3 100644 --- a/llvm/test/tools/llvm-ifs/func.ifs +++ b/llvm/test/tools/llvm-ifs/func.ifs @@ -10,13 +10,13 @@ # RUN: llvm-ifs -action write-ifs -o - %s %s | \ # RUN: FileCheck %s --check-prefixes=CHECK-MERGE-IFS -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-DAG: a: { Type: Func } -# CHECK-IFS-DAG: b: { Type: Object, Size: 4 } +# CHECK-IFS-DAG: - { Name: a, Type: Func } +# CHECK-IFS-DAG: - { Name: b, Type: Object, Size: 4 } # CHECK-IFS: ... # CHECK-ELF: ELF Header: @@ -39,18 +39,18 @@ # CHECK-DARWIN-TBD3-NEXT: ... # Here we are testing to see if two identical symbols will merge. -# CHECK-MERGE-IFS: --- !experimental-ifs-v1 -# CHECK-MERGE-IFS-NEXT: IfsVersion: 1.0 +# CHECK-MERGE-IFS: --- !experimental-ifs-v2 +# CHECK-MERGE-IFS-NEXT: IfsVersion: 2.0 # CHECK-MERGE-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-MERGE-IFS-NEXT: ObjectFileFormat: ELF # CHECK-MERGE-IFS-NEXT: Symbols: -# CHECK-MERGE-IFS-NEXT: a: { Type: Func } +# CHECK-MERGE-IFS-NEXT: - { Name: a, Type: Func } # CHECK-MERGE-IFS-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/ios-tbd.ifs b/llvm/test/tools/llvm-ifs/ios-tbd.ifs index 13671b02c5cb..5b21aedf6500 100644 --- a/llvm/test/tools/llvm-ifs/ios-tbd.ifs +++ b/llvm/test/tools/llvm-ifs/ios-tbd.ifs @@ -13,10 +13,10 @@ # CHECK-NEXT: symbols: [ __Z3fooi ] # CHECK-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: arm64-apple-ios ObjectFileFormat: TBD Symbols: - __Z3fooi: { Type: Func } + - { Name: __Z3fooi, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/macos-tbd.ifs b/llvm/test/tools/llvm-ifs/macos-tbd.ifs index bd84806fb219..b04828b2a39d 100644 --- a/llvm/test/tools/llvm-ifs/macos-tbd.ifs +++ b/llvm/test/tools/llvm-ifs/macos-tbd.ifs @@ -13,10 +13,10 @@ # CHECK-NEXT: symbols: [ __Z3fooi ] # CHECK-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: arm64-apple-macosx ObjectFileFormat: TBD Symbols: - __Z3fooi: { Type: Func } + - { Name: __Z3fooi, Type: Func } ... 
diff --git a/llvm/test/tools/llvm-ifs/object-function-size-weak-combo.ifs b/llvm/test/tools/llvm-ifs/object-function-size-weak-combo.ifs index b6328fbc58d9..769f423f328a 100644 --- a/llvm/test/tools/llvm-ifs/object-function-size-weak-combo.ifs +++ b/llvm/test/tools/llvm-ifs/object-function-size-weak-combo.ifs @@ -4,17 +4,17 @@ # RUN: llvm-ifs -action write-bin -o - %s %S/func.ifs %S/object.ifs %S/weak.ifs | \ # RUN: llvm-readelf --all | FileCheck %s --check-prefixes=CHECK-ELF -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-DAG: e: { Type: Object, Size: 8 } -# CHECK-IFS-DAG: a: { Type: Func } -# CHECK-IFS-DAG: f: { Type: Object, Size: 2 } -# CHECK-IFS-DAG: _Z10strongFuncv: { Type: Func } -# CHECK-IFS-DAG: _Z8weakFuncv: { Type: Func, Weak: true } -# CHECK-IFS-DAG: b: { Type: Object, Size: 4 } +# CHECK-IFS-DAG: - { Name: e, Type: Object, Size: 8 } +# CHECK-IFS-DAG: - { Name: a, Type: Func } +# CHECK-IFS-DAG: - { Name: f, Type: Object, Size: 2 } +# CHECK-IFS-DAG: - { Name: _Z10strongFuncv, Type: Func } +# CHECK-IFS-DAG: - { Name: _Z8weakFuncv, Type: Func, Weak: true } +# CHECK-IFS-DAG: - { Name: b, Type: Object, Size: 4 } # CHECK-IFS: ... # CHECK-ELF: FUNC GLOBAL DEFAULT 1 _Z10strongFuncv @@ -24,11 +24,11 @@ # CHECK-ELF: OBJECT GLOBAL DEFAULT 1 e # CHECK-ELF: OBJECT GLOBAL DEFAULT 1 f ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - e: { Type: Object, Size: 8 } - f: { Type: Object, Size: 2 } + - { Name: e, Type: Object, Size: 8 } + - { Name: f, Type: Object, Size: 2 } ... diff --git a/llvm/test/tools/llvm-ifs/object.ifs b/llvm/test/tools/llvm-ifs/object.ifs index 733cc38001d3..c4823c20fce2 100644 --- a/llvm/test/tools/llvm-ifs/object.ifs +++ b/llvm/test/tools/llvm-ifs/object.ifs @@ -4,12 +4,12 @@ # RUN: llvm-ifs -action write-bin -o - %s | \ # RUN: llvm-readelf --all | FileCheck %s --check-prefixes=CHECK-ELF -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-NEXT: b: { Type: Object, Size: 4 } +# CHECK-IFS-NEXT: - { Name: b, Type: Object, Size: 4 } # CHECK-IFS-NEXT: ... # CHECK-ELF: ELF Header: @@ -19,10 +19,10 @@ # CHECK-ELF-NOT: FUNC GLOBAL DEFAULT 1 a # CHECK-ELF: OBJECT GLOBAL DEFAULT 1 b ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - b: { Type: Object, Size: 4 } + - { Name: b, Type: Object, Size: 4 } ... 
diff --git a/llvm/test/tools/llvm-ifs/strong.ifs b/llvm/test/tools/llvm-ifs/strong.ifs index bdc930fbaaa3..ccc1f9e5d8b6 100644 --- a/llvm/test/tools/llvm-ifs/strong.ifs +++ b/llvm/test/tools/llvm-ifs/strong.ifs @@ -1,17 +1,17 @@ # RUN: llvm-ifs -action write-ifs -o - %s %S/strong.ifs | FileCheck %s --check-prefixes=CHECK-IFS -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-DAG: _Z8weakFuncv: { Type: Func } +# CHECK-IFS-DAG: - { Name: _Z8weakFuncv, Type: Func } # CHECK-IFS: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - _Z8weakFuncv: { Type: Func } + - { Name: _Z8weakFuncv, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/tvos-tbd.ifs b/llvm/test/tools/llvm-ifs/tvos-tbd.ifs index 08c8478c1daf..6db01bf6162f 100644 --- a/llvm/test/tools/llvm-ifs/tvos-tbd.ifs +++ b/llvm/test/tools/llvm-ifs/tvos-tbd.ifs @@ -13,10 +13,10 @@ # CHECK-NEXT: symbols: [ __Z3fooi ] # CHECK-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: arm64-apple-tvos ObjectFileFormat: TBD Symbols: - __Z3fooi: { Type: Func } + - { Name: __Z3fooi, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/version-ok.ifs b/llvm/test/tools/llvm-ifs/version-ok.ifs index fd150ee77d55..646b8624feb1 100644 --- a/llvm/test/tools/llvm-ifs/version-ok.ifs +++ b/llvm/test/tools/llvm-ifs/version-ok.ifs @@ -1,9 +1,9 @@ # RUN: llvm-ifs -action write-ifs -o - %s %S/object.ifs ---- !experimental-ifs-v1 -IfsVersion: 1.1 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - a: { Type: Func } + - { Name: a, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/watchos-tbd.ifs b/llvm/test/tools/llvm-ifs/watchos-tbd.ifs index 74a9d962a3e0..fcb914265202 100644 --- a/llvm/test/tools/llvm-ifs/watchos-tbd.ifs +++ b/llvm/test/tools/llvm-ifs/watchos-tbd.ifs @@ -13,10 +13,10 @@ # CHECK-NEXT: symbols: [ __Z3fooi ] # CHECK-NEXT: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: arm64-apple-watchos ObjectFileFormat: TBD Symbols: - __Z3fooi: { Type: Func } + - { Name: __Z3fooi, Type: Func } ... diff --git a/llvm/test/tools/llvm-ifs/weak-mismatch.ifs b/llvm/test/tools/llvm-ifs/weak-mismatch.ifs index 15abc2064cc2..cf45dff8c062 100644 --- a/llvm/test/tools/llvm-ifs/weak-mismatch.ifs +++ b/llvm/test/tools/llvm-ifs/weak-mismatch.ifs @@ -10,10 +10,10 @@ # CHECK-TYPE-NEXT: Filename: # CHECK-TYPE-NEXT: Type Values: Object Func ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - foobar: { Type: Object, Size: 1, Weak: true } + - { Name: foobar, Type: Object, Size: 1, Weak: true } ... 
diff --git a/llvm/test/tools/llvm-ifs/weak.ifs b/llvm/test/tools/llvm-ifs/weak.ifs index a7441be1c1f2..bf8091050530 100644 --- a/llvm/test/tools/llvm-ifs/weak.ifs +++ b/llvm/test/tools/llvm-ifs/weak.ifs @@ -1,19 +1,19 @@ # RUN: llvm-ifs -action write-ifs -o - %s | FileCheck %s --check-prefixes=CHECK-IFS -# CHECK-IFS: --- !experimental-ifs-v1 -# CHECK-IFS-NEXT: IfsVersion: 1.0 +# CHECK-IFS: --- !experimental-ifs-v2 +# CHECK-IFS-NEXT: IfsVersion: 2.0 # CHECK-IFS-NEXT: Triple: x86_64-unknown-linux-gnu # CHECK-IFS-NEXT: ObjectFileFormat: ELF # CHECK-IFS-NEXT: Symbols: -# CHECK-IFS-DAG: _Z8weakFuncv: { Type: Func, Weak: true } -# CHECK-IFS-DAG: _Z10strongFuncv: { Type: Func } +# CHECK-IFS-DAG: - { Name: _Z8weakFuncv, Type: Func, Weak: true } +# CHECK-IFS-DAG: - { Name: _Z10strongFuncv, Type: Func } # CHECK-IFS: ... ---- !experimental-ifs-v1 -IfsVersion: 1.0 +--- !experimental-ifs-v2 +IfsVersion: 2.0 Triple: x86_64-unknown-linux-gnu ObjectFileFormat: ELF Symbols: - _Z8weakFuncv: { Type: Func, Weak: true } - _Z10strongFuncv: { Type: Func } + - { Name: _Z8weakFuncv, Type: Func, Weak: true } + - { Name: _Z10strongFuncv, Type: Func } ... diff --git a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s index 73e3a68c6d9c..f04b7e5c0776 100644 --- a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s +++ b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s @@ -29,7 +29,9 @@ b: b .+4 # CHECK-LABEL: : -# CHECK-NEXT: bt 2, .+65532 +# CHECK-NEXT: 18: bt 2, 0x14 +# CHECK-NEXT: 1c: bt 1, 0x20 bt: bt 2, .-4 + bgt .+4 diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test index 5a229caeb482..1dee2aa2d52a 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test @@ -55,7 +55,7 @@ CHECK: 000000a4 : CHECK-NEXT: ... CHECK: Disassembly of section .tdata: CHECK: 00000000 : -CHECK-NEXT: 0: 40 09 21 f9 bdnzfl 9, .+8696 +CHECK-NEXT: 0: 40 09 21 f9 bdnzfl 9, 0x21f8 CHECK-NEXT: 4: f0 1b 86 6e CHECK: Disassembly of section .tbss: CHECK: 00000008 : diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/attribute.s b/llvm/test/tools/llvm-readobj/ELF/RISCV/attribute.s new file mode 100644 index 000000000000..5ae7bfcb0290 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/attribute.s @@ -0,0 +1,44 @@ +## Test llvm-readobj & llvm-readelf can decode RISC-V attributes correctly. 
+ +# RUN: llvm-mc -triple riscv32 -filetype obj -o %t.rv32.o %s +# RUN: llvm-mc -triple riscv64 -filetype obj -o %t.rv64.o %s +# RUN: llvm-readobj --arch-specific %t.rv32.o \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readelf -A %t.rv32.o \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readobj --arch-specific %t.rv64.o \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readelf -A %t.rv64.o \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJ + +.attribute Tag_stack_align, 16 +# CHECK-OBJ: Tag: 4 +# CHECK-OBJ-NEXT: Value: 16 +# CHECK-OBJ-NEXT: TagName: stack_align +# CHECK-OBJ-NEXT: Description: Stack alignment is 16-bytes + +.attribute Tag_arch, "rv32i2p0_m2p0_a2p0_c2p0" +# CHECK-OBJ: Tag: 5 +# CHECK-OBJ-NEXT: TagName: arch +# CHECK-OBJ-NEXT: Value: rv32i2p0_m2p0_a2p0_c2p0 + +.attribute Tag_unaligned_access, 0 +# CHECK-OBJ: Tag: 6 +# CHECK-OBJ-NEXT: Value: 0 +# CHECK-OBJ-NEXT: TagName: unaligned_access +# CHECK-OBJ-NEXT: Description: No unaligned access + +.attribute Tag_priv_spec, 2 +# CHECK-OBJ: Tag: 8 +# CHECK-OBJ-NEXT: TagName: priv_spec +# CHECK-OBJ-NEXT: Value: 2 + +.attribute Tag_priv_spec_minor, 0 +# CHECK-OBJ: Tag: 10 +# CHECK-OBJ-NEXT: TagName: priv_spec_minor +# CHECK-OBJ-NEXT: Value: 0 + +.attribute Tag_priv_spec_revision, 0 +# CHECK-OBJ: Tag: 12 +# CHECK-OBJ-NEXT: TagName: priv_spec_revision +# CHECK-OBJ-NEXT: Value: 0 diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-section-size.test b/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-section-size.test new file mode 100644 index 000000000000..524134e1579b --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-section-size.test @@ -0,0 +1,20 @@ +## This test case is used to ensure the error code is caught by llvm-readobj. + +# RUN: yaml2obj %s -D BITS=32 -o %t.32.o +# RUN: llvm-readobj -A %t.32.o 2>&1 | FileCheck -DFILE=%t %s +# RUN: yaml2obj %s -D BITS=64 -o %t.64.o +# RUN: llvm-readobj -A %t.64.o 2>&1 | FileCheck -DFILE=%t %s + +# CHECK: warning: '[[FILE]].{{32|64}}.o': invalid section length 0 at offset 0x1 + +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS]] + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_RISCV +Sections: + - Name: .riscv.attributes + Type: SHT_RISCV_ATTRIBUTES +## Version: 'A'(0x41), section length: 0 + Content: 4100000000 diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-version.test b/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-version.test new file mode 100644 index 000000000000..9a4d81bcc4f1 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/invalid-attr-version.test @@ -0,0 +1,21 @@ +## This test case is used to ensure llvm-readobj checks the version of +## attribute sections correctly. 
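Both RISC-V attribute error tests synthesize raw `.riscv.attributes` contents, so it helps to keep the expected layout in mind: a one-byte format-version ('A', 0x41) followed by a sub-section whose leading 32-bit little-endian length field counts itself. A hypothetical decoder for just that header (the names and the `std::optional` interface are mine, not the parser's API):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <optional>

struct AttrHeader {
  uint8_t FormatVersion;  // must be 'A' (0x41); a 0x42 byte trips the version check
  uint32_t SectionLength; // little-endian, counts its own 4 bytes; 0 is invalid
};

// Hypothetical helper mirroring the two error paths these tests exercise.
std::optional<AttrHeader> readAttrHeader(const uint8_t *Data, size_t Size) {
  if (Size < 1 || Data[0] != 0x41)
    return std::nullopt; // "unrecognised FormatVersion: 0x..."
  if (Size < 1 + sizeof(uint32_t))
    return std::nullopt; // no room for the length field
  uint32_t Len;
  std::memcpy(&Len, Data + 1, sizeof(Len)); // assumes a little-endian host
  if (Len < sizeof(Len) || Len > Size - 1)
    return std::nullopt; // "invalid section length ... at offset 0x1"
  return AttrHeader{Data[0], Len};
}
```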
+ +# RUN: yaml2obj %s -D BITS=32 -o %t.32.o +# RUN: llvm-readobj -A %t.32.o 2>&1 | FileCheck -DFILE=%t %s +# RUN: yaml2obj %s -D BITS=64 -o %t.64.o +# RUN: llvm-readobj -A %t.64.o 2>&1 | FileCheck -DFILE=%t %s + +# CHECK: warning: '[[FILE]].{{32|64}}.o': unrecognised FormatVersion: 0x42 + +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS]] + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_RISCV +Sections: + - Name: .riscv.attributes + Type: SHT_RISCV_ATTRIBUTES +## Version: 'B' + Content: 42 diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/lit.local.cfg b/llvm/test/tools/llvm-readobj/ELF/RISCV/lit.local.cfg new file mode 100644 index 000000000000..c63820126f8c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'RISCV' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/section-types.test b/llvm/test/tools/llvm-readobj/ELF/RISCV/section-types.test new file mode 100644 index 000000000000..d1f35306afb2 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/section-types.test @@ -0,0 +1,21 @@ +## Show that all RISCV specific section types are properly printed for both +## LLVM and GNU styles. + +# RUN: yaml2obj %s -o %t-riscv.o +# RUN: llvm-readobj --section-headers %t-riscv.o | FileCheck %s --check-prefix=LLVM +# RUN: llvm-readelf --section-headers %t-riscv.o | FileCheck %s --check-prefix=GNU + +# LLVM: Name: .riscv.attributes (1) +# LLVM-NEXT: Type: SHT_RISCV_ATTRIBUTES (0x70000003) + +# GNU: [ 1] .riscv.attributes RISCV_ATTRIBUTES + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_RISCV +Sections: + - Name: .riscv.attributes + Type: SHT_RISCV_ATTRIBUTES diff --git a/llvm/test/tools/llvm-readobj/ELF/RISCV/validate-attr-section.test b/llvm/test/tools/llvm-readobj/ELF/RISCV/validate-attr-section.test new file mode 100644 index 000000000000..66a5a7a8d31f --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/RISCV/validate-attr-section.test @@ -0,0 +1,17 @@ +## We only implement attribute section printing for little-endian encoding. + +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-readobj -A %t.o | FileCheck %s + +# CHECK: Attributes not implemented. + +--- !ELF +FileHeader: + Class: ELFCLASS64 +## Test big-endian encoding. + Data: ELFDATA2MSB + Type: ET_REL + Machine: EM_RISCV +Sections: + - Name: .riscv.attributes + Type: SHT_RISCV_ATTRIBUTES diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test index cdbec32efa24..f7551b481a86 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test @@ -112,3 +112,98 @@ ProgramHeaders: Sections: - Section: .hash - Section: .dynamic + +## Each SHT_HASH section starts with two 32-bit fields: nbucket and nchain. +## Check we report an error when a DT_HASH value points to data that has size less than 8 bytes. 
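All of the in-bounds/out-of-bounds pairs that follow are built around one computation, which the new `checkHashTable` in ELFDumper.cpp (further down in this patch) performs in 64-bit arithmetic so `nbucket + nchain` cannot overflow. A standalone sketch of the check, assuming 32-bit hash words:

```cpp
#include <cstdint>

// The table needs 8 bytes for the nbucket/nchain header plus one 32-bit
// word per bucket and per chain entry, all inside the file buffer.
// E.g. for the cases below: file size 0x1d4 with the table at 0x54 leaves
// room for exactly 94 words, so nbucket=0x5d/nchain=0x1 fits and
// nbucket=0x5e/nchain=0x1 runs past the EOF.
bool hashTableFits(uint64_t BufSize, uint64_t SecOffset,
                   uint32_t NBucket, uint32_t NChain) {
  const uint64_t HeaderSize = 2 * sizeof(uint32_t); // nbucket + nchain
  if (BufSize < SecOffset || BufSize - SecOffset < HeaderSize)
    return false; // covers the DT_HASH-near-EOF case above
  return BufSize - SecOffset - HeaderSize >=
         (uint64_t(NBucket) + NChain) * sizeof(uint32_t);
}
```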
+ +# RUN: yaml2obj --docnum=3 %s -o %t3.o +# RUN: llvm-readelf --elf-hash-histogram %t3.o 2>&1 | FileCheck %s --check-prefix=ERR1 -DFILE=%t3.o + +# ERR1: warning: '[[FILE]]': the hash table at offset 0x2b1 goes past the end of the file (0x2b8){{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + Chain: [ 0 ] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x239 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + FileSize: 0x23a + Sections: + - Section: .hash + - Section: .dynamic + +## Check we report a warning when the hash table goes past the end of the file. + +## Case A.1: the hash table ends right before the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. +# RUN: yaml2obj --docnum=4 %s -o %t4.1.o -DNBUCKET=0x5d -DNCHAIN=0x1 +# RUN: llvm-readelf --elf-hash-histogram %t4.1.o 2>&1 | \ +# RUN: FileCheck %s --implicit-check-not={{.}} --allow-empty + +## Case A.2: the hash table ends 1 byte past the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. +# RUN: yaml2obj --docnum=4 %s -o %t4.2.o -DNBUCKET=0x5e -DNCHAIN=0x1 +# RUN: llvm-readelf --elf-hash-histogram %t4.2.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR2 -DFILE=%t4.2.o --implicit-check-not="warning:" +# ERR2: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 94, nchain = 1{{$}} + +## Case B.1: the hash table ends right before the EOF. We have a broken nchain +## field that has a value larger than the number of chains. +# RUN: yaml2obj --docnum=4 %s -o %t4.3.o -DNBUCKET=0x1 -DNCHAIN=0x5d +# RUN: llvm-readelf --elf-hash-histogram %t4.3.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR3 -DFILE=%t4.3.o --implicit-check-not="warning:" +# ERR3: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1){{$}} + +## Case B.2: the hash table ends 1 byte past the EOF. We have a broken nchain +## field that has a value larger than the number of chains. 
+# RUN: yaml2obj --docnum=4 %s -o %t4.4.o -DNBUCKET=0x1 -DNCHAIN=0x5e +# RUN: llvm-readelf --elf-hash-histogram %t4.4.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR4 -DFILE=%t4.4.o --implicit-check-not="warning:" +# ERR4: warning: '[[FILE]]': hash table nchain (94) differs from symbol count derived from SHT_DYNSYM section header (1){{$}} +# ERR4: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 1, nchain = 94{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + NBucket: [[NBUCKET]] + Chain: [ 0 ] + NChain: [[NCHAIN]] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + Sections: + - Section: .hash + - Section: .dynamic diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test b/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test index 9434da1882a8..40e680e3b751 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-symbols.test @@ -347,3 +347,104 @@ ProgramHeaders: Sections: - Section: .hash - Section: .dynamic + +## Each SHT_HASH section starts with two 32-bit fields: nbucket and nchain. +## Check we report an error when a DT_HASH value points to data that has size less than 8 bytes. + +# RUN: yaml2obj --docnum=6 %s -o %t6.o +# RUN: llvm-readelf --hash-symbols %t6.o 2>&1 | FileCheck %s --check-prefix=ERR1 -DFILE=%t6.o + +# ERR1: warning: '[[FILE]]': the hash table at offset 0x2b1 goes past the end of the file (0x2b8){{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + Chain: [ 0 ] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x239 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + FileSize: 0x23a + Sections: + - Section: .hash + - Section: .dynamic + +## Check we report a warning when the hash table goes past the end of the file. + +## Case A.1: the hash table ends right before the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. +# RUN: yaml2obj --docnum=7 %s -o %t7.1.o -DNBUCKET=0x5d -DNCHAIN=0x1 +# RUN: llvm-readelf --hash-symbols %t7.1.o 2>&1 | FileCheck %s --check-prefix=NOERR1 +# NOERR1: Symbol table of .hash for image: +# NOERR1-NEXT: Num Buc: Value Size Type Bind Vis Ndx Name +# NOERR1-EMPTY: + +## Case A.2: the hash table ends 1 byte past the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. +# RUN: yaml2obj --docnum=7 %s -o %t7.2.o -DNBUCKET=0x5e -DNCHAIN=0x1 +# RUN: llvm-readelf --hash-symbols %t7.2.o 2>&1 | FileCheck %s --check-prefix=ERR2 -DFILE=%t7.2.o +# ERR2: Symbol table of .hash for image: +# ERR2-NEXT: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 94, nchain = 1{{$}} +# ERR2-NOT: {{.}} + +## Case B.1: the hash table ends right before the EOF. We have a broken nchain +## field that has a value larger than the number of chains.
+# RUN: yaml2obj --docnum=7 %s -o %t7.3.o -DNBUCKET=0x1 -DNCHAIN=0x5d +# RUN: llvm-readelf --hash-symbols %t7.3.o 2>&1 | \ +# RUN: FileCheck %s --implicit-check-not="warning:" --check-prefix=NOERR2 -DFILE=%t7.3.o +# NOERR2: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1) +# NOERR2: Symbol table of .hash for image: +# NOERR2-NEXT: Num Buc: Value Size Type Bind Vis Ndx Name +# NOERR2-NOT: {{.}} + +## Case B.2: the hash table ends 1 byte past the EOF. We have a broken nchain +## field that has a value larger than the number of chains. +# RUN: yaml2obj --docnum=7 %s -o %t7.4.o -DNBUCKET=0x1 -DNCHAIN=0x5e +# RUN: llvm-readelf --hash-symbols %t7.4.o 2>&1 | FileCheck %s --check-prefix=ERR3 -DFILE=%t7.4.o +# ERR3: Symbol table of .hash for image: +# ERR3-NEXT: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 1, nchain = 94{{$}} +# ERR3-NOT: {{.}} + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + NBucket: [[NBUCKET]] + Chain: [ 0 ] + NChain: [[NCHAIN]] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + Sections: + - Section: .hash + - Section: .dynamic diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-table.test b/llvm/test/tools/llvm-readobj/ELF/hash-table.test index 2abfdd01baf2..8cbe615eee22 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-table.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-table.test @@ -115,3 +115,133 @@ ProgramHeaders: VAddr: 0x1010 Sections: - Section: .dynamic + +## Each SHT_HASH section starts with two 32-bit fields: nbucket and nchain. +## Check we report an error when a DT_HASH value points to data that has size less than 8 bytes. + +# RUN: yaml2obj --docnum=4 %s -o %t4.o +# RUN: llvm-readelf --hash-table %t4.o 2>&1 | FileCheck %s --check-prefix=ERR1 -DFILE=%t4.o +# RUN: llvm-readobj --hash-table %t4.o 2>&1 | FileCheck %s --check-prefix=ERR1 -DFILE=%t4.o + +# ERR1: HashTable { +# ERR1-NEXT: warning: '[[FILE]]': the hash table at offset 0x2b1 goes past the end of the file (0x2b8){{$}} +# ERR1-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + Chain: [ 0 ] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x239 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + FileSize: 0x23a + Sections: + - Section: .hash + - Section: .dynamic + +## Check we report a warning when the hash table goes past the end of the file. + +## Case A.1: the hash table ends right before the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. 
+# RUN: yaml2obj --docnum=5 %s -o %t5.1.o -DNBUCKET=0x5d -DNCHAIN=0x1 +# RUN: llvm-readelf --hash-table %t5.1.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=NOERR1 --implicit-check-not="warning:" +# RUN: llvm-readobj --hash-table %t5.1.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=NOERR1 --implicit-check-not="warning:" + +# NOERR1: HashTable { +# NOERR1-NEXT: Num Buckets: 93 +# NOERR1-NEXT: Num Chains: 1 +## Here we would dump the rest of the file as buckets array because we have a broken nbucket field. +## No need to check what we dump, we only want to test that we have no unexpected warnings/crashes. +# NOERR1-NEXT: Buckets: +# NOERR1-NEXT: Chains: [0] +# NOERR1-NEXT: } + +## Case A.2: the hash table ends 1 byte past the EOF. We have a broken nbucket +## field that has a value larger than the number of buckets. +# RUN: yaml2obj --docnum=5 %s -o %t5.2.o -DNBUCKET=0x5e -DNCHAIN=0x1 +# RUN: llvm-readelf --hash-table %t5.2.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR2 -DFILE=%t5.2.o --implicit-check-not="warning:" +# RUN: llvm-readobj --hash-table %t5.2.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR2 -DFILE=%t5.2.o --implicit-check-not="warning:" + +# ERR2: HashTable { +# ERR2-NEXT: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 94, nchain = 1{{$}} +# ERR2-NEXT: } + +## Case B.1: the hash table ends right before the EOF. We have a broken nchain +## field that has a value larger than the number of chains. +# RUN: yaml2obj --docnum=5 %s -o %t5.3.o -DNBUCKET=0x1 -DNCHAIN=0x5d +# RUN: llvm-readelf --hash-table %t5.3.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=NOERR2 -DFILE=%t5.3.o --implicit-check-not="warning:" +# RUN: llvm-readobj --hash-table %t5.3.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=NOERR2 -DFILE=%t5.3.o --implicit-check-not="warning:" + +# NOERR2: warning: '[[FILE]]': hash table nchain (93) differs from symbol count derived from SHT_DYNSYM section header (1) +# NOERR2: HashTable { +# NOERR2-NEXT: Num Buckets: 1 +# NOERR2-NEXT: Num Chains: 93 +# NOERR2-NEXT: Buckets: [0] +## Here we would dump the rest of the file as chain array because we have a broken nchain field. +## No need to check what we dump, we only want to test that we have no unexpected warnings/crashes. +# NOERR2-NEXT: Chains: +# NOERR2-NEXT: } + +## Case B.2: the hash table ends 1 byte past the EOF. We have a broken nchain +## field that has a value larger than the number of chains. 
+# RUN: yaml2obj --docnum=5 %s -o %t5.4.o +# RUN: llvm-readelf --hash-table %t5.4.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR3 -DFILE=%t5.4.o --implicit-check-not="warning:" +# RUN: llvm-readobj --hash-table %t5.4.o 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR3 -DFILE=%t5.4.o --implicit-check-not="warning:" + +# ERR3: warning: '[[FILE]]': hash table nchain (94) differs from symbol count derived from SHT_DYNSYM section header (1) +# ERR3: HashTable { +# ERR3-NEXT: warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 1, nchain = 94{{$}} +# ERR3-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Bucket: [ 0 ] + NBucket: [[NBUCKET]] + Chain: [ 0 ] + NChain: [[NCHAIN]] + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Entries: + - Tag: DT_HASH + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 +DynamicSymbols: [] +ProgramHeaders: + - Type: PT_LOAD + Sections: + - Section: .hash + - Section: .dynamic diff --git a/llvm/test/tools/yaml2obj/ELF/hash-section.yaml b/llvm/test/tools/yaml2obj/ELF/hash-section.yaml index 4aad9c11fd59..2274e4c9a7c9 100644 --- a/llvm/test/tools/yaml2obj/ELF/hash-section.yaml +++ b/llvm/test/tools/yaml2obj/ELF/hash-section.yaml @@ -276,3 +276,39 @@ Sections: Type: SHT_HASH Size: 0x1 Chain: [ 1 ] + +## Check we can override "nbucket" and "nchain" values of a SHT_HASH section using "NBucket" +## and "NChain" tags. Check that the section size is unaffected when we do this. + +# RUN: yaml2obj --docnum=14 %s -o %t14 +# RUN: llvm-readobj --sections --section-data %t14 | FileCheck %s --check-prefix=OVERRIDE + +# OVERRIDE: Name: .hash +# OVERRIDE-NEXT: Type: SHT_HASH +# OVERRIDE-NEXT: Flags [ +# OVERRIDE-NEXT: ] +# OVERRIDE-NEXT: Address: 0x0 +# OVERRIDE-NEXT: Offset: 0x34 +# OVERRIDE-NEXT: Size: 28 +# OVERRIDE-NEXT: Link: 0 +# OVERRIDE-NEXT: Info: 0 +# OVERRIDE-NEXT: AddressAlignment: 0 +# OVERRIDE-NEXT: EntrySize: 0 +# OVERRIDE-NEXT: SectionData ( +# OVERRIDE-NEXT: 0000: AA000000 BB000000 01000000 02000000 +# OVERRIDE-NEXT: 0010: 03000000 04000000 05000000 +# OVERRIDE-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_386 +Sections: + - Name: .hash + Type: SHT_HASH + Bucket: [ 1, 2 ] + Chain: [ 3, 4, 5 ] + NBucket: 0xAA + NChain: 0xBB diff --git a/llvm/tools/llvm-ifs/llvm-ifs.cpp b/llvm/tools/llvm-ifs/llvm-ifs.cpp index 3b0d2ee725ff..0d1a7518dad3 100644 --- a/llvm/tools/llvm-ifs/llvm-ifs.cpp +++ b/llvm/tools/llvm-ifs/llvm-ifs.cpp @@ -26,6 +26,7 @@ #include "llvm/TextAPI/MachO/TextAPIWriter.h" #include <set> #include <string> +#include <vector> using namespace llvm; using namespace llvm::yaml; @@ -34,8 +35,8 @@ using namespace llvm::MachO; #define DEBUG_TYPE "llvm-ifs" namespace { -const VersionTuple IFSVersionCurrent(1, 2); -} +const VersionTuple IFSVersionCurrent(2, 0); +} // end anonymous namespace static cl::opt<std::string> Action("action", cl::desc("<llvm-ifs action>"), cl::value_desc("write-ifs | write-bin"), @@ -76,6 +77,7 @@ std::string getTypeName(IFSSymbolType Type) { } struct IFSSymbol { + IFSSymbol() = default; IFSSymbol(std::string SymbolName) : Name(SymbolName) {} std::string Name; uint64_t Size; @@ -85,6 +87,8 @@ struct IFSSymbol { bool operator<(const IFSSymbol &RHS) const { return Name < RHS.Name; } }; +LLVM_YAML_IS_SEQUENCE_VECTOR(IFSSymbol) + namespace llvm { namespace yaml { /// YAML traits for IFSSymbolType.
@@ -124,6 +128,7 @@ template <> struct ScalarTraits<VersionTuple> { /// YAML traits for IFSSymbol. template <> struct MappingTraits<IFSSymbol> { static void mapping(IO &IO, IFSSymbol &Symbol) { + IO.mapRequired("Name", Symbol.Name); IO.mapRequired("Type", Symbol.Type); // The need for symbol size depends on the symbol type. if (Symbol.Type == IFSSymbolType::NoType) @@ -140,20 +145,6 @@ template <> struct MappingTraits<IFSSymbol> { static const bool flow = true; }; -/// YAML traits for set of IFSSymbols. -template <> struct CustomMappingTraits<std::set<IFSSymbol>> { - static void inputOne(IO &IO, StringRef Key, std::set<IFSSymbol> &Set) { - std::string Name = Key.str(); - IFSSymbol Sym(Name); - IO.mapRequired(Name.c_str(), Sym); - Set.insert(Sym); - } - - static void output(IO &IO, std::set<IFSSymbol> &Set) { - for (auto &Sym : Set) - IO.mapRequired(Sym.Name.c_str(), const_cast<IFSSymbol &>(Sym)); - } -}; } // namespace yaml } // namespace llvm @@ -167,7 +158,7 @@ class IFSStub { std::string ObjectFileFormat; Optional<std::string> SOName; std::vector<std::string> NeededLibs; - std::set<IFSSymbol> Symbols; + std::vector<IFSSymbol> Symbols; IFSStub() = default; IFSStub(const IFSStub &Stub) @@ -186,14 +177,18 @@ namespace yaml { /// YAML traits for IFSStub objects. template <> struct MappingTraits<IFSStub> { static void mapping(IO &IO, IFSStub &Stub) { - if (!IO.mapTag("!experimental-ifs-v1", true)) + if (!IO.mapTag("!experimental-ifs-v2", true)) IO.setError("Not a .ifs YAML file."); + + auto OldContext = IO.getContext(); + IO.setContext(&Stub); IO.mapRequired("IfsVersion", Stub.IfsVersion); IO.mapOptional("Triple", Stub.Triple); IO.mapOptional("ObjectFileFormat", Stub.ObjectFileFormat); IO.mapOptional("SOName", Stub.SOName); IO.mapOptional("NeededLibs", Stub.NeededLibs); IO.mapRequired("Symbols", Stub.Symbols); + IO.setContext(&OldContext); } }; } // namespace yaml @@ -218,7 +213,7 @@ static Expected<std::unique_ptr<IFSStub>> readInputFile(StringRef FilePath) { return std::move(Stub); } -int writeTbdStub(const llvm::Triple &T, const std::set<IFSSymbol> &Symbols, +int writeTbdStub(const llvm::Triple &T, const std::vector<IFSSymbol> &Symbols, const StringRef Format, raw_ostream &Out) { auto PlatformKindOrError = @@ -280,7 +275,7 @@ int writeTbdStub(const llvm::Triple &T, const std::set<IFSSymbol> &Symbols, return 0; } -int writeElfStub(const llvm::Triple &T, const std::set<IFSSymbol> &Symbols, +int writeElfStub(const llvm::Triple &T, const std::vector<IFSSymbol> &Symbols, const StringRef Format, raw_ostream &Out) { SmallString<0> Storage; Storage.clear(); @@ -387,8 +382,8 @@ int writeIfso(const IFSStub &Stub, bool IsWriteIfs, raw_ostream &Out) { // TODO: Drop ObjectFileFormat, it can be subsumed from the triple.
// New Interface Stubs Yaml Format: -// --- !experimental-ifs-v1 -// IfsVersion: 1.0 +// --- !experimental-ifs-v2 +// IfsVersion: 2.0 // Triple: // ObjectFileFormat: // Symbols: @@ -517,7 +512,7 @@ int main(int argc, char *argv[]) { } for (auto &Entry : SymbolMap) - Stub.Symbols.insert(Entry.second); + Stub.Symbols.push_back(Entry.second); std::error_code SysErr; diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 24f41ae49469..ca05f99aa715 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -52,6 +52,8 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MipsABIFlags.h" +#include "llvm/Support/RISCVAttributeParser.h" +#include "llvm/Support/RISCVAttributes.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -2605,9 +2607,35 @@ template <class ELFT> void ELFDumper<ELFT>::printNeededLibraries() { W.startLine() << L << "\n"; } +template <class ELFT> +static bool checkHashTable(const ELFFile<ELFT> *Obj, + const typename ELFT::Hash *H, StringRef FileName) { + auto WarnAndReturn = [&](uint64_t Off, const Twine &Msg = "") { + reportWarning(createError("the hash table at offset 0x" + + Twine::utohexstr(Off) + + " goes past the end of the file (0x" + + Twine::utohexstr(Obj->getBufSize()) + ")" + Msg), + FileName); + return false; + }; + + // Each SHT_HASH section starts with two 32-bit fields: nbucket and nchain. + const unsigned HeaderSize = 2 * sizeof(typename ELFT::Word); + const uint64_t SecOffset = (const uint8_t *)H - Obj->base(); + if (Obj->getBufSize() - SecOffset < HeaderSize) + return WarnAndReturn(SecOffset); + + if (Obj->getBufSize() - SecOffset - HeaderSize < + ((uint64_t)H->nbucket + H->nchain) * sizeof(typename ELFT::Word)) + return WarnAndReturn(SecOffset, ", nbucket = " + Twine(H->nbucket) + + ", nchain = " + Twine(H->nchain)); + return true; +} + template <class ELFT> void ELFDumper<ELFT>::printHashTable() { DictScope D(W, "HashTable"); - if (!HashTable) + if (!HashTable || + !checkHashTable(ObjF->getELFFile(), HashTable, ObjF->getFileName())) return; W.printNumber("Num Buckets", HashTable->nbucket); W.printNumber("Num Chains", HashTable->nchain); @@ -2678,6 +2706,7 @@ template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() { const ELFFile<ELFT> *Obj = ObjF->getELFFile(); switch (Obj->getHeader()->e_machine) { case EM_ARM: + case EM_RISCV: printAttributes(); break; case EM_MIPS: { @@ -2698,40 +2727,45 @@ template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() { } } -template <class ELFT> void ELFDumper<ELFT>::printAttributes() { - W.startLine() << "Attributes not implemented.\n"; -} - namespace { -template <> void ELFDumper<ELF32LE>::printAttributes() { - const ELFFile<ELF32LE> *Obj = ObjF->getELFFile(); - if (Obj->getHeader()->e_machine != EM_ARM) { +template <class ELFT> void ELFDumper<ELFT>::printAttributes() { + const ELFFile<ELFT> *Obj = ObjF->getELFFile(); + if (!Obj->isLE()) { W.startLine() << "Attributes not implemented.\n"; return; } + const unsigned Machine = Obj->getHeader()->e_machine; + assert((Machine == EM_ARM || Machine == EM_RISCV) && + "Attributes not implemented."); + DictScope BA(W, "BuildAttributes"); - for (const ELFO::Elf_Shdr &Sec : - unwrapOrError(ObjF->getFileName(), Obj->sections())) { - if (Sec.sh_type != ELF::SHT_ARM_ATTRIBUTES) + for (const auto &Sec : unwrapOrError(ObjF->getFileName(), Obj->sections())) { + if (Sec.sh_type != ELF::SHT_ARM_ATTRIBUTES && + Sec.sh_type != ELF::SHT_RISCV_ATTRIBUTES) continue; ArrayRef<uint8_t> Contents = unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(&Sec)); if (Contents[0] !=
ELFAttrs::Format_Version) { - errs() << "unrecognised FormatVersion: 0x" - << Twine::utohexstr(Contents[0]) << '\n'; + reportWarning(createError(Twine("unrecognised FormatVersion: 0x") + + Twine::utohexstr(Contents[0])), + ObjF->getFileName()); continue; } - W.printHex("FormatVersion", Contents[0]); if (Contents.size() == 1) continue; - // TODO: Print error and delete the redundant FormatVersion check above. - if (Error E = ARMAttributeParser(&W).parse(Contents, support::little)) - consumeError(std::move(E)); + // TODO: Delete the redundant FormatVersion check above. + if (Machine == EM_ARM) { + if (Error E = ARMAttributeParser(&W).parse(Contents, support::little)) + reportWarning(std::move(E), ObjF->getFileName()); + } else if (Machine == EM_RISCV) { + if (Error E = RISCVAttributeParser(&W).parse(Contents, support::little)) + reportWarning(std::move(E), ObjF->getFileName()); + } } } @@ -3569,6 +3603,11 @@ static std::string getSectionTypeString(unsigned Arch, unsigned Type) { return "MIPS_ABIFLAGS"; } break; + case EM_RISCV: + switch (Type) { + case SHT_RISCV_ATTRIBUTES: + return "RISCV_ATTRIBUTES"; + } } switch (Type) { case SHT_NULL: @@ -3886,9 +3925,7 @@ template <class ELFT> void GNUStyle<ELFT>::printHashSymbols(const ELFO *Obj) { auto StringTable = this->dumper()->getDynamicStringTable(); auto DynSyms = this->dumper()->dynamic_symbols(); - // Try printing .hash - if (auto SysVHash = this->dumper()->getHashTable()) { - OS << "\n Symbol table of .hash for image:\n"; + auto PrintHashTable = [&](const Elf_Hash *SysVHash) { if (ELFT::Is64Bits) OS << " Num Buc: Value Size Type Bind Vis Ndx Name"; else @@ -3917,6 +3954,12 @@ template <class ELFT> void GNUStyle<ELFT>::printHashSymbols(const ELFO *Obj) { Visited[Ch] = true; } } + }; + + if (const Elf_Hash *SysVHash = this->dumper()->getHashTable()) { + OS << "\n Symbol table of .hash for image:\n"; + if (checkHashTable(Obj, SysVHash, this->FileName)) + PrintHashTable(SysVHash); } // Try printing .gnu.hash @@ -4439,6 +4482,9 @@ template <class ELFT> void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) { // Print histogram for .hash section if (const Elf_Hash *HashTable = this->dumper()->getHashTable()) { + if (!checkHashTable(Obj, HashTable, this->FileName)) + return; + size_t NBucket = HashTable->nbucket; size_t NChain = HashTable->nchain; ArrayRef<Elf_Word> Buckets = HashTable->buckets(); diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp index d4de44313a29..8d46bd2cb627 100644 --- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -195,30 +195,30 @@ static bool parseCommand(StringRef InputString, Command &Cmd, // If no cmd, assume it's CODE. Cmd = Command::Code; } - const char *pos = InputString.data(); + const char *Pos = InputString.data(); // Skip delimiters and parse input filename (if needed).
if (ClBinaryName.empty()) { - pos += strspn(pos, kDelimiters); - if (*pos == '"' || *pos == '\'') { - char quote = *pos; - pos++; - const char *end = strchr(pos, quote); - if (!end) + Pos += strspn(Pos, kDelimiters); + if (*Pos == '"' || *Pos == '\'') { + char Quote = *Pos; + Pos++; + const char *End = strchr(Pos, Quote); + if (!End) return false; - ModuleName = std::string(pos, end - pos); - pos = end + 1; + ModuleName = std::string(Pos, End - Pos); + Pos = End + 1; } else { - int name_length = strcspn(pos, kDelimiters); - ModuleName = std::string(pos, name_length); - pos += name_length; + int NameLength = strcspn(Pos, kDelimiters); + ModuleName = std::string(Pos, NameLength); + Pos += NameLength; } } else { ModuleName = ClBinaryName; } // Skip delimiters and parse module offset. - pos += strspn(pos, kDelimiters); - int offset_length = strcspn(pos, kDelimiters); - return !StringRef(pos, offset_length).getAsInteger(0, ModuleOffset); + Pos += strspn(Pos, kDelimiters); + int OffsetLength = strcspn(Pos, kDelimiters); + return !StringRef(Pos, OffsetLength).getAsInteger(0, ModuleOffset); } static void symbolizeInput(StringRef InputString, LLVMSymbolizer &Symbolizer, diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp index d471e79842ca..1a06b0994bc0 100644 --- a/llvm/unittests/Analysis/VectorUtilsTest.cpp +++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp @@ -100,10 +100,10 @@ TEST_F(BasicTest, isSplat) { TEST_F(BasicTest, scaleShuffleMask) { SmallVector ScaledMask; - scaleShuffleMask(1, {3,2,0,-2}, ScaledMask); - EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({3,2,0,-2})); - scaleShuffleMask(4, {3,2,0,-1}, ScaledMask); - EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({12,13,14,15,8,9,10,11,0,1,2,3,-1,-1,-1,-1})); + scaleShuffleMask(1, {3,2,0,-2}, ScaledMask); + EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({3,2,0,-2})); + scaleShuffleMask(4, {3,2,0,-1}, ScaledMask); + EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({12,13,14,15,8,9,10,11,0,1,2,3,-1,-1,-1,-1})); } TEST_F(BasicTest, getSplatIndex) { diff --git a/llvm/unittests/Support/ELFAttributeParserTest.cpp b/llvm/unittests/Support/ELFAttributeParserTest.cpp index ad4e309d8953..8234d4ee176f 100644 --- a/llvm/unittests/Support/ELFAttributeParserTest.cpp +++ b/llvm/unittests/Support/ELFAttributeParserTest.cpp @@ -40,9 +40,9 @@ TEST(AttributeHeaderParser, UnrecognizedFormatVersion) { testParseError(bytes, "unrecognized format-version: 0x1"); } -TEST(AttributeHeaderParser, InvalidSubsectionLength) { +TEST(AttributeHeaderParser, InvalidSectionLength) { static const uint8_t bytes[] = {'A', 3, 0, 0, 0}; - testParseError(bytes, "invalid subsection length 3 at offset 0x1"); + testParseError(bytes, "invalid section length 3 at offset 0x1"); } TEST(AttributeHeaderParser, UnrecognizedVendorName) { diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index 402fadb12d6c..53ef761302ae 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -5,18 +5,17 @@ import lit.util + def parse_args(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(prog='lit') parser.add_argument('test_paths', nargs='+', metavar="TEST_PATH", help='File or path to include in the test suite') - parser.add_argument("--version", - dest="show_version", - help="Show version and exit", - version="lit " + lit.__version__, - action="version") + parser.add_argument('--version', + action='version', + version='%(prog)s ' + lit.__version__) 
parser.add_argument("-j", "--threads", "--workers", dest="workers", @@ -189,12 +188,15 @@ def parse_args(): return opts + def _positive_int(arg): return _int(arg, 'positive', lambda i: i > 0) + def _non_negative_int(arg): return _int(arg, 'non-negative', lambda i: i >= 0) + def _int(arg, kind, pred): desc = "requires {} integer, but found '{}'" try: @@ -205,6 +207,7 @@ def _int(arg, kind, pred): raise _error(desc, kind, arg) return i + def _case_insensitive_regex(arg): import re try: @@ -212,6 +215,7 @@ def _case_insensitive_regex(arg): except re.error as reason: raise _error("invalid regular expression: '{}', {}", arg, reason) + def _error(desc, *args): msg = desc.format(*args) return argparse.ArgumentTypeError(msg) diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py index 4b61c8f37761..8c675c0e5ba4 100755 --- a/llvm/utils/lit/lit/main.py +++ b/llvm/utils/lit/lit/main.py @@ -20,7 +20,6 @@ def main(builtin_params={}): opts = lit.cl_arguments.parse_args() - params = create_params(builtin_params, opts.user_params) is_windows = platform.system() == 'Windows' diff --git a/llvm/utils/lit/tests/usage.py b/llvm/utils/lit/tests/usage.py index d168c5eff9e3..77b3573c5375 100644 --- a/llvm/utils/lit/tests/usage.py +++ b/llvm/utils/lit/tests/usage.py @@ -1,6 +1,7 @@ -# Basic sanity check that usage works. +# Basic sanity check for `--help` and `--version` options. # -# RUN: %{lit} --help > %t.out -# RUN: FileCheck < %t.out %s +# RUN: %{lit} --help | FileCheck %s --check-prefix=HELP +# RUN: %{lit} --version 2>&1 | FileCheck %s --check-prefix=VERSION # -# CHECK: usage: lit.py [-h] +# HELP: usage: lit [-h] +# VERSION: lit {{[0-9]+\.[0-9]+\.[0-9]+[a-zA-Z0-9]*}} diff --git a/llvm/utils/lit/tests/version.py b/llvm/utils/lit/tests/version.py deleted file mode 100644 index 1d5e152ddd3b..000000000000 --- a/llvm/utils/lit/tests/version.py +++ /dev/null @@ -1,5 +0,0 @@ -# Basic sanity check that --version works. -# -# RUN: %{lit} --version 2>&1 | FileCheck %s -# -# CHECK: lit {{[0-9]+\.[0-9]+\.[0-9]+[a-zA-Z0-9]*}} diff --git a/llvm/utils/llvm-locstats/llvm-locstats.py b/llvm/utils/llvm-locstats/llvm-locstats.py index 7b2f706fca94..dec87f9caf7d 100755 --- a/llvm/utils/llvm-locstats/llvm-locstats.py +++ b/llvm/utils/llvm-locstats/llvm-locstats.py @@ -121,14 +121,16 @@ def draw_location_diff(self, locstats_to_compare): ax = fig.add_subplot(111) init_plot(plt) + comparison_keys = list(coverage_buckets()) ax.bar(buckets, self.variables_coverage_map.values(), align='edge', - tick_label=self.variables_coverage_map.keys(), width=0.4, + width=0.4, label='variables of {}'.format(self.file_name)) ax.bar(buckets_to_compare, locstats_to_compare.variables_coverage_map.values(), color='r', align='edge', width=-0.4, - tick_label=locstats_to_compare.variables_coverage_map.keys(), label='variables of {}'.format(locstats_to_compare.file_name)) + ax.set_xticks(range(len(comparison_keys))) + ax.set_xticklabels(comparison_keys) props = dict(boxstyle='round', facecolor='wheat', alpha=0.5) plt.text(0.02, 0.88, diff --git a/mlir/docs/Dialects/Linalg.md b/mlir/docs/Dialects/Linalg.md index af4db423e44e..878ce8f11523 100644 --- a/mlir/docs/Dialects/Linalg.md +++ b/mlir/docs/Dialects/Linalg.md @@ -29,7 +29,7 @@ performed on the Linalg IR and that have influenced its design: 1. Tiled Producer-Consumer Fusion with Parametric Tile-And-Fuse. 1. Map to Parallel and Reduction Loops and Hardware. 1. Vectorization: Rewrite in Vector Form. -1. Lower to Loops (Affine and/or Generic). +1. 
Lower to Loops (Affine, Generic and Parallel). 1. Lower to Library Calls or Special Instructions, Intrinsics or ISA. 1. Partially Lower to Iterations Over a Finer-Grained Linalg Op. @@ -241,7 +241,7 @@ example: (i, j) -> (i, j), (i, j) -> (i, j) } -#attrs = {args_in: 1, args_out: 1, indexings: #indexing_maps} +#attrs = {args_in: 2, args_out: 1, indexings: #indexing_maps} func @example(%A: memref, %B: memref, %C: memref) { linalg.generic #attrs (%A, %B, %C) { ^bb0(%a: f32, %b: f32): @@ -295,7 +295,7 @@ example: (i, j) -> (i, j), (i, j) -> (i, j) } -#attrs = {args_in: 1, args_out: 1, indexings: #indexing_maps, fun: #fun_attr} +#attrs = {args_in: 2, args_out: 1, indexings: #indexing_maps, fun: #fun_attr} func @example(%A: memref, %B: memref, %C: memref) { linalg.generic #attrs (%A, %B, %C) { ^bb0(%a: f32, %b: f32): diff --git a/mlir/docs/Dialects/SPIR-V.md b/mlir/docs/Dialects/SPIR-V.md index 20919c08244b..acf56f137e16 100644 --- a/mlir/docs/Dialects/SPIR-V.md +++ b/mlir/docs/Dialects/SPIR-V.md @@ -15,6 +15,8 @@ Vulkan and OpenCL. It is fully defined in a [human-readable specification][SpirvSpec]; the syntax of various SPIR-V instructions are encoded in a [machine-readable grammar][SpirvGrammar]. +[TOC] + ## Design Guidelines SPIR-V is a binary intermediate language that serves dual purpose: on one side, @@ -459,8 +461,9 @@ can be represented in the dialect as ``` Operation documentation is written in each op's Op Definition Spec using -TableGen. A markdown version of the doc can be found at -[mlir.llvm.org][LlvmMlirSpirvDoc] or generated using `mlir-tblgen -gen-doc`. +TableGen. A markdown version of the doc can be generated using +`mlir-tblgen -gen-doc` and is attached in the +[Operation definitions](#operation-definitions) section. ### Ops from extended instruction sets @@ -1224,6 +1227,10 @@ conversion][MlirDialectConversionSignatureConversion] might be needed as well. operations contained within its region are valid operations in the SPIR-V dialect. +## Operation definitions + +[include "Dialects/SPIRVOps.md"] + [Spirv]: https://www.khronos.org/registry/spir-v/ [SpirvSpec]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html [SpirvLogicalLayout]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#_a_id_logicallayout_a_logical_layout_of_a_module @@ -1270,7 +1277,6 @@ dialect. [GitHubDialectTracking]: https://github.com/tensorflow/mlir/issues/302 [GitHubLoweringTracking]: https://github.com/tensorflow/mlir/issues/303 [GenSpirvUtilsPy]: https://github.com/llvm/llvm-project/blob/master/mlir/utils/spirv/gen_spirv_dialect.py -[LlvmMlirSpirvDoc]: ../Dialects/SPIRVOps/ [CustomTypeAttrTutorial]: ../DefiningAttributesAndTypes/ [VulkanSpirv]: https://renderdoc.org/vkspec_chunked/chap40.html#spirvenv [VulkanShaderInterface]: https://renderdoc.org/vkspec_chunked/chap14.html#interfaces-resources diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index 231594b82fc7..866f3666d7d0 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -4,295 +4,46 @@ This document describes the available MLIR passes and their contracts. [TOC] -## Affine dialect lowering (`-lower-affine`) +## General Transformation Passes -Convert operations from the affine dialect into operations from the loop and -standard dialects. +[include "GeneralPasses.md"] -`affine.for` operations are converted to `loop.for` operations that are free of -certain structural restrictions (on their bounds and step). `affine.if` is -similarly converted to the `loop.if` operation. 
`affine.apply` operations are -converted into sequences of primitive arithmetic operations from the standard -dialect that have the same effect, using operands of the `index` type. -Consequently, named maps and sets thare are no longer in use may be removed from -the module. +## Conversion Passes -For example, `%r = affine.apply affine_map<(d0, d1)[s0] -> (d0 + 2*d1 + -s0)>(%d0, %d1)[%s0]` -can be converted into: +[include "ConversionPasses.md"] -```mlir -%d0 = <...> -%d1 = <...> -%s0 = <...> -%0 = constant 2 : index -%1 = muli %0, %d1 -%2 = addi %d0, %1 -%r = addi %2, %s0 -``` +## Quantizer Passes -### Input invariant +[include "QuantizerPasses.md"] -- no `Tensor` types; +## `affine` Dialect Passes -These restrictions may be lifted in the future. +[include "AffinePasses.md"] -### Output IR +## `fxpmath` Dialect Passes -Functions with `affine.for` and `affine.if` operations eliminated. These -functions may contain operations from the Standard dialect in addition to those -already present before the pass. +[include "FxpMathPasses.md"] -### Invariants +## `gpu` Dialect Passes -- Functions without a body are not modified. -- The semantics of the other functions is preserved. -- Individual operations other than those mentioned above are not modified if - they do not depend on the loop iterator value or on the result of - `affine.apply`. +[include "GPUPasses.md"] -## Conversion from Standard to LLVM IR dialect (`-convert-std-to-llvm`) +## `linalg` Dialect Passes -Convert standard operations into the LLVM IR dialect operations. +[include "LinalgPasses.md"] -### Input invariant +## `llvm` Dialect Passes -- operations including: arithmetic on integers and floats, constants, direct - calls, returns and branches; -- no `tensor` types; -- all `vector` are one-dimensional; -- all blocks are reachable by following the successors of the first basic - block; +[include "LLVMPasses.md"] -If other operations are present and their results are required by the LLVM IR -dialect operations, the pass will fail. Any LLVM IR operations or types already -present in the IR will be kept as is. +## `loop` Dialect Passes -### Output IR +[include "LoopPasses.md"] -Functions converted to LLVM IR. Function arguments types are converted -one-to-one. Function results are converted one-to-one and, in case more than 1 -value is returned, packed into an LLVM IR struct type. Function calls and -returns are updated accordingly. Block argument types are updated to use LLVM IR -types. +## `quant` Dialect Passes -## Data Copy DMA generation (`-affine-data-copy-generate`) +[include "QuantPasses.md"] -Replaces all loads and stores on memref's living in 'slowMemorySpace' by -introducing DMA operations (strided DMA if necessary) to transfer data to/from -`fastMemorySpace` and rewriting the original load's/store's to instead -load/store from the allocated fast memory buffers. Additional options specify -the identifier corresponding to the fast memory space and the amount of fast -memory space available. The pass traverses through the nesting structure, -recursing to inner levels if necessary to determine at what depth DMA transfers -need to be placed so that the allocated buffers fit within the memory capacity -provided. If this is not possible (for example, when the elemental type itself -is of size larger than the DMA capacity), an error with location information is -emitted. The DMA transfers are also hoisted up past all loops with respect to -which the transfers are invariant. 
+## `spv` Dialect Passes -Input - -```mlir -func @loop_nest_tiled() -> memref<256x1024xf32> { - %0 = alloc() : memref<256x1024xf32> - affine.for %i0 = 0 to 256 step 32 { - affine.for %i1 = 0 to 1024 step 32 { - affine.for %i2 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 32)(%i0) { - affine.for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) { - %1 = affine.load %0[%i2, %i3] : memref<256x1024xf32> - } - } - } - } - return %0 : memref<256x1024xf32> -} -``` - -Output (with flags: -affine-data-copy-generate -affine-data-copy-generate-fast-mem-space=2) - -```mlir -module { - func @loop_nest_tiled() -> memref<256x1024xf32> { - %c262144 = constant 262144 : index - %c0 = constant 0 : index - %0 = alloc() : memref<256x1024xf32> - %1 = alloc() : memref<256x1024xf32, 2> - %2 = alloc() : memref<1xi32> - affine.dma_start %0[%c0, %c0], %1[%c0, %c0], %2[%c0], %c262144 : memref<256x1024xf32>, memref<256x1024xf32, 2>, memref<1xi32> - affine.dma_wait %2[%c0], %c262144 : memref<1xi32> - affine.for %arg0 = 0 to 256 step 32 { - affine.for %arg1 = 0 to 1024 step 32 { - affine.for %arg2 = #map1(%arg0) to #map2(%arg0) { - affine.for %arg3 = #map1(%arg1) to #map2(%arg1) { - %3 = affine.load %1[%arg2, %arg3] : memref<256x1024xf32, 2> - } - } - } - } - dealloc %2 : memref<1xi32> - dealloc %1 : memref<256x1024xf32, 2> - return %0 : memref<256x1024xf32> - } -} -``` - -## Loop tiling (`-affine-loop-tile`) - -Performs tiling or blocking of loop nests. It currently works on perfect loop -nests. - -## Loop unroll (`-affine-loop-unroll`) - -This pass implements loop unrolling. It is able to unroll loops with arbitrary -bounds, and generate a cleanup loop when necessary. - -## Loop unroll and jam (`-affine-loop-unroll-jam`) - -This pass implements unroll and jam for loops. It works on both perfect or -imperfect loop nests. - -## Loop fusion (`-affine-loop-fusion`) - -Performs fusion of loop nests using a slicing-based approach. The fused loop -nests, when possible, are rewritten to access significantly smaller local -buffers instead of the original memref's, and the latter are often -either completely optimized away or contracted. This transformation leads to -enhanced locality and lower memory footprint through the elimination or -contraction of temporaries / intermediate memref's. These benefits are sometimes -achieved at the expense of redundant computation through a cost model that -evaluates available choices such as the depth at which a source slice should be -materialized in the designation slice. - -## Memref bound checking (`-memref-bound-check`) - -Checks all load's and store's on memref's for out of bound accesses, and reports -any out of bound accesses (both overrun and underrun) with location information. - -```mlir -test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of upper bound access along dimension #2 - %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> - ^ -test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of lower bound access along dimension #2 - %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> - ^ -``` - -## Memref dataflow optimization (`-memref-dataflow-opt`) - -This pass performs store to load forwarding for memref's to eliminate memory -accesses and potentially the entire memref if all its accesses are forwarded. 
- -Input - -```mlir -func @store_load_affine_apply() -> memref<10x10xf32> { - %cf7 = constant 7.0 : f32 - %m = alloc() : memref<10x10xf32> - affine.for %i0 = 0 to 10 { - affine.for %i1 = 0 to 10 { - affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32> - %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32> - %v1 = addf %v0, %v0 : f32 - } - } - return %m : memref<10x10xf32> -} -``` - -Output - -```mlir -module { - func @store_load_affine_apply() -> memref<10x10xf32> { - %cst = constant 7.000000e+00 : f32 - %0 = alloc() : memref<10x10xf32> - affine.for %arg0 = 0 to 10 { - affine.for %arg1 = 0 to 10 { - affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32> - %1 = addf %cst, %cst : f32 - } - } - return %0 : memref<10x10xf32> - } -} - -``` - -## Memref dependence analysis (`-memref-dependence-check`) - -This pass performs dependence analysis to determine dependences between pairs of -memory operations (load's and store's) on memref's. Dependence analysis exploits -polyhedral information available (affine maps, expressions, and affine.apply -operations) to precisely represent dependences using affine constraints, while -also computing dependence vectors from them, where each component of the -dependence vector provides a lower and an upper bound on the dependence distance -along the corresponding dimension. - -```mlir -test/Transforms/memref-dataflow-opt.mlir:232:7: note: dependence from 2 to 1 at depth 1 = ([1, 1], [-inf, +inf]) - store %cf9, %m[%idx] : memref<10xf32> -``` - -## Pipeline data transfer (`-affine-pipeline-data-transfer`) - -This pass performs a transformation to overlap non-blocking DMA operations in a -loop with computations through double buffering. This is achieved by advancing -dma_start operations with respect to other operations. - -Input - -```mlir -func @pipelinedatatransfer() { - %0 = alloc() : memref<256xf32> - %1 = alloc() : memref<32xf32, 1> - %2 = alloc() : memref<1xf32> - %c0 = constant 0 : index - %c128 = constant 128 : index - affine.for %i0 = 0 to 8 { - affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32> - affine.dma_wait %2[%c0], %c128 : memref<1xf32> - %3 = affine.load %1[%i0] : memref<32xf32, 1> - %4 = "compute"(%3) : (f32) -> f32 - affine.store %4, %1[%i0] : memref<32xf32, 1> - } - return -} -``` - -Output - -```mlir -module { - func @pipelinedatatransfer() { - %c8 = constant 8 : index - %c0 = constant 0 : index - %0 = alloc() : memref<256xf32> - %c0_0 = constant 0 : index - %c128 = constant 128 : index - %1 = alloc() : memref<2x32xf32, 1> - %2 = alloc() : memref<2x1xf32> - affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> - affine.for %arg0 = 1 to 8 { - affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> - %8 = affine.apply #map3(%arg0) - %9 = affine.apply #map4(%8) - %10 = affine.apply #map4(%8) - affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> - %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1> - %12 = "compute"(%11) : (f32) -> f32 - affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1> - } - %3 = affine.apply #map3(%c8) - %4 = affine.apply #map4(%3) - %5 = affine.apply #map4(%3) - affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> - %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1> - %7 = "compute"(%6) : (f32) -> f32 - affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1> - 
dealloc %2 : memref<2x1xf32> - dealloc %1 : memref<2x32xf32, 1> - return - } -} -``` +[include "SPIRVPasses.md"] diff --git a/mlir/include/mlir/Analysis/LoopAnalysis.h b/mlir/include/mlir/Analysis/LoopAnalysis.h index 5141df736a9c..7ed19ef99f87 100644 --- a/mlir/include/mlir/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Analysis/LoopAnalysis.h @@ -82,7 +82,7 @@ bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim, /// 'def' and all its uses have the same shift factor. // TODO(mlir-team): extend this to check for memory-based dependence // violation when we have the support. -bool isInstwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts); +bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts); } // end namespace mlir #endif // MLIR_ANALYSIS_LOOP_ANALYSIS_H diff --git a/mlir/include/mlir/CMakeLists.txt b/mlir/include/mlir/CMakeLists.txt index 4754391dc39f..594dc6180f50 100644 --- a/mlir/include/mlir/CMakeLists.txt +++ b/mlir/include/mlir/CMakeLists.txt @@ -1,3 +1,6 @@ +add_subdirectory(Conversion) add_subdirectory(Dialect) add_subdirectory(IR) add_subdirectory(Interfaces) +add_subdirectory(Quantizer) +add_subdirectory(Transforms) diff --git a/mlir/include/mlir/Conversion/CMakeLists.txt b/mlir/include/mlir/Conversion/CMakeLists.txt new file mode 100644 index 000000000000..d4ce2634f450 --- /dev/null +++ b/mlir/include/mlir/Conversion/CMakeLists.txt @@ -0,0 +1,6 @@ + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRConversionPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc ConversionPasses ./) diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h index d5f48d29ea6c..049e8538d746 100644 --- a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h +++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h @@ -27,6 +27,7 @@ class Pass; /// calling the conversion. std::unique_ptr> createSimpleLoopsToGPUPass(unsigned numBlockDims, unsigned numThreadDims); +std::unique_ptr> createSimpleLoopsToGPUPass(); /// Create a pass that converts every loop operation within the body of the /// FuncOp into a GPU launch. The number of workgroups and workgroup size for @@ -37,6 +38,7 @@ createSimpleLoopsToGPUPass(unsigned numBlockDims, unsigned numThreadDims); std::unique_ptr> createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups, ArrayRef<int64_t> workGroupSize); +std::unique_ptr> createLoopToGPUPass(); /// Creates a pass that converts loop.parallel operations into a gpu.launch /// operation. The mapping of loop dimensions to launch dimensions is derived diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td new file mode 100644 index 000000000000..5553655fafae --- /dev/null +++ b/mlir/include/mlir/Conversion/Passes.td @@ -0,0 +1,267 @@ +//===-- Passes.td - Conversion pass definition file --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_PASSES +#define MLIR_CONVERSION_PASSES + +include "mlir/Pass/PassBase.td" + +//===----------------------------------------------------------------------===// +// AffineToStandard +//===----------------------------------------------------------------------===// + +def ConvertAffineToStandard : Pass<"lower-affine"> { + let summary = "Lower Affine operations to a combination of Standard and Loop " + "operations"; + let description = [{ + + Convert operations from the affine dialect into operations from the loop and + standard dialects. + + `affine.for` operations are converted to `loop.for` operations that are free + of certain structural restrictions (on their bounds and step). `affine.if` + is similarly converted to the `loop.if` operation. `affine.apply` operations + are converted into sequences of primitive arithmetic operations from the + standard dialect that have the same effect, using operands of the `index` + type. Consequently, named maps and sets that are no longer in use may be + removed from the module. + + For example, `%r = affine.apply affine_map<(d0, d1)[s0] -> (d0 + 2*d1 + + s0)>(%d0, %d1)[%s0]` + can be converted into: + + ```mlir + %d0 = <...> + %d1 = <...> + %s0 = <...> + %0 = constant 2 : index + %1 = muli %0, %d1 + %2 = addi %d0, %1 + %r = addi %2, %s0 + ``` + + #### Input invariant + + - no `Tensor` types; + + These restrictions may be lifted in the future. + + #### Output IR + + Functions with `affine.for` and `affine.if` operations eliminated. These + functions may contain operations from the Standard dialect in addition to + those already present before the pass. + + #### Invariants + + - Functions without a body are not modified. + - The semantics of the other functions is preserved. + - Individual operations other than those mentioned above are not modified + if they do not depend on the loop iterator value or on the result of + `affine.apply`.
+ }]; + let constructor = "mlir::createLowerAffinePass()"; +} + +//===----------------------------------------------------------------------===// +// AVX512ToLLVM +//===----------------------------------------------------------------------===// + +def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm"> { + let summary = "Convert the operations from the avx512 dialect into the LLVM " + "dialect"; + let constructor = "mlir::createConvertAVX512ToLLVMPass()"; +} + +//===----------------------------------------------------------------------===// +// GPUToCUDA +//===----------------------------------------------------------------------===// + +def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda"> { + let summary = "Convert all launch_func ops to CUDA runtime calls"; + let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()"; +} + +//===----------------------------------------------------------------------===// +// GPUToNVVM +//===----------------------------------------------------------------------===// + +def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm"> { + let summary = "Generate NVVM operations for gpu operations"; + let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()"; +} + +//===----------------------------------------------------------------------===// +// GPUToROCDL +//===----------------------------------------------------------------------===// + +def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl"> { + let summary = "Generate ROCDL operations for gpu operations"; + let constructor = "mlir::createLowerGpuOpsToROCDLOpsPass()"; +} + +//===----------------------------------------------------------------------===// +// GPUToSPIRV +//===----------------------------------------------------------------------===// + +def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv"> { + let summary = "Convert GPU dialect to SPIR-V dialect"; + let constructor = "mlir::createConvertGPUToSPIRVPass()"; +} + +//===----------------------------------------------------------------------===// +// GPUToVulkan +//===----------------------------------------------------------------------===// + +def ConvertGpuLaunchFuncToVulkanLaunchFunc + : Pass<"convert-gpu-launch-to-vulkan-launch"> { + let summary = "Convert gpu.launch_func to vulkanLaunch external call"; + let constructor = "mlir::createConvertGpuLaunchFuncToVulkanLaunchFuncPass()"; +} + +def ConvertVulkanLaunchFuncToVulkanCalls : Pass<"launch-func-to-vulkan"> { + let summary = "Convert vulkanLaunch external call to Vulkan runtime external " + "calls"; + let constructor = "mlir::createConvertVulkanLaunchFuncToVulkanCallsPass()"; +} + +//===----------------------------------------------------------------------===// +// LinalgToLLVM +//===----------------------------------------------------------------------===// + +def ConvertLinalgToLLVM : Pass<"convert-linalg-to-llvm"> { + let summary = "Convert the operations from the linalg dialect into the LLVM " + "dialect"; + let constructor = "mlir::createConvertLinalgToLLVMPass()"; +} + +//===----------------------------------------------------------------------===// +// LinalgToSPIRV +//===----------------------------------------------------------------------===// + +def ConvertLinalgToSPIRV : Pass<"convert-linalg-to-spirv"> { + let summary = "Convert Linalg ops to SPIR-V ops"; + let constructor = "mlir::createLinalgToSPIRVPass()"; +} + +//===----------------------------------------------------------------------===// +// LoopToStandard 
+//===----------------------------------------------------------------------===// + +def ConvertLoopToStandard : Pass<"convert-loop-to-std"> { + let summary = "Convert Loop dialect to Standard dialect, replacing structured" + " control flow with a CFG"; + let constructor = "mlir::createLowerToCFGPass()"; +} + +//===----------------------------------------------------------------------===// +// LoopsToGPU +//===----------------------------------------------------------------------===// + +def ConvertSimpleLoopsToGPU : Pass<"convert-loops-to-gpu"> { + let summary = "Convert top-level loops to GPU kernels"; + let constructor = "mlir::createSimpleLoopsToGPUPass()"; + let options = [ + Option<"numBlockDims", "gpu-block-dims", "unsigned", /*default=*/"1u", + "Number of GPU block dimensions for mapping">, + Option<"numThreadDims", "gpu-thread-dims", "unsigned", /*default=*/"1u", + "Number of GPU thread dimensions for mapping"> + ]; +} + +def ConvertLoopsToGPU : Pass<"convert-loop-op-to-gpu"> { + let summary = "Convert top-level loop::ForOp to GPU kernels"; + let constructor = "mlir::createLoopToGPUPass()"; + let options = [ + ListOption<"numWorkGroups", "gpu-num-workgroups", "int64_t", + "Num workgroups in the GPU launch", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">, + ListOption<"workGroupSize", "gpu-workgroup-size", "int64_t", + "Workgroup Size in the GPU launch", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + +def ConvertParallelLoopToGpu : Pass<"convert-parallel-loops-to-gpu"> { + let summary = "Convert mapped loop.parallel ops to gpu launch operations"; + let constructor = "mlir::createParallelLoopToGpuPass()"; +} + +//===----------------------------------------------------------------------===// +// StandardToLLVM +//===----------------------------------------------------------------------===// + +def ConvertStandardToLLVM : Pass<"convert-std-to-llvm"> { + let summary = "Convert scalar and vector operations from the Standard to the " + "LLVM dialect"; + let description = [{ + Convert standard operations into the LLVM IR dialect operations. + + #### Input invariant + + - operations including: arithmetic on integers and floats, constants, + direct calls, returns and branches; + - no `tensor` types; + - all `vector` are one-dimensional; + - all blocks are reachable by following the successors of the first basic + block; + + If other operations are present and their results are required by the LLVM + IR dialect operations, the pass will fail. Any LLVM IR operations or types + already present in the IR will be kept as is. + + #### Output IR + + Functions converted to LLVM IR. Function arguments types are converted + one-to-one. Function results are converted one-to-one and, in case more than + 1 value is returned, packed into an LLVM IR struct type. Function calls and + returns are updated accordingly. Block argument types are updated to use + LLVM IR types. 
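The `ConvertSimpleLoopsToGPU` options above mirror the `numBlockDims`/`numThreadDims` arguments of the constructor declared in `LoopsToGPUPass.h` earlier in this patch. A short sketch of the two construction styles (the 1-D mapping is an illustrative choice, not a recommendation):

```cpp
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Pass/PassManager.h"

static void addLoopsToGPUMapping(mlir::OpPassManager &pm) {
  // Explicit-argument form: the mapping is pinned in C++.
  pm.addPass(mlir::createSimpleLoopsToGPUPass(/*numBlockDims=*/1,
                                              /*numThreadDims=*/1));
  // Parameterless overload: both values come from the tablegen'd pass
  // options (-gpu-block-dims / -gpu-thread-dims), e.g. in textual pipelines.
  //   pm.addPass(mlir::createSimpleLoopsToGPUPass());
}
```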
+ }]; + let constructor = "mlir::createLowerToLLVMPass()"; + let options = [ + Option<"useAlloca", "use-alloca", "bool", /*default=*/"false", + "Use `alloca` instead of `call @malloc` for converting std.alloc">, + Option<"useBarePtrCallConv", "use-bare-ptr-memref-call-conv", "bool", + /*default=*/"false", + "Replace FuncOp's MemRef arguments with bare pointers to the MemRef " + "element types">, + Option<"emitCWrappers", "emit-c-wrappers", "bool", /*default=*/"false", + "Emit wrappers for C-compatible pointer-to-struct memref " + "descriptors">, + Option<"indexBitwidth", "index-bitwidth", "unsigned", + /*default=*/"kDeriveIndexBitwidthFromDataLayout", + "Bitwidth of the index type, 0 to use size of machine word">, + ]; +} + +//===----------------------------------------------------------------------===// +// StandardToSPIRV +//===----------------------------------------------------------------------===// + +def LegalizeStandardForSPIRV : Pass<"legalize-std-for-spirv"> { + let summary = "Legalize standard ops for SPIR-V lowering"; + let constructor = "mlir::createLegalizeStdOpsForSPIRVLoweringPass()"; +} + +def ConvertStandardToSPIRV : Pass<"convert-std-to-spirv"> { + let summary = "Convert Standard Ops to SPIR-V dialect"; + let constructor = "mlir::createConvertStandardToSPIRVPass()"; +} + +//===----------------------------------------------------------------------===// +// VectorToLLVM +//===----------------------------------------------------------------------===// + +def ConvertVectorToLLVM : Pass<"convert-vector-to-llvm"> { + let summary = "Lower the operations from the vector dialect into the LLVM " + "dialect"; + let constructor = "mlir::createConvertVectorToLLVMPass()"; +} + +#endif // MLIR_CONVERSION_PASSES diff --git a/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h b/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h index 5c45a3c32ca0..e96b7cfe76e5 100644 --- a/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h +++ b/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h @@ -5,23 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + #ifndef MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ #define MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ -#include "mlir/Transforms/DialectConversion.h" - namespace mlir { class MLIRContext; -class ModuleOp; -template <typename T> class OpPassBase; +class OwningRewritePatternList; /// Collect a set of patterns to convert from the Vector dialect to loops + std. void populateVectorToAffineLoopsConversionPatterns( MLIRContext *context, OwningRewritePatternList &patterns); -/// Create a pass to convert vector operations to affine loops + std dialect.
-OpPassBase<ModuleOp> *createLowerVectorToLoopsPass(); - } // namespace mlir #endif // MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ diff --git a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt index f33061b2d87c..404c926f60ed 100644 --- a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt @@ -1 +1,7 @@ add_subdirectory(IR) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRAffinePassIncGen) + +add_mlir_doc(Passes -gen-pass-doc AffinePasses ./) diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index 735c6c0360f5..75ff4a33649d 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -42,10 +42,14 @@ std::unique_ptr<OpPassBase<FuncOp>> createAffineDataCopyGenerationPass( unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024, uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max()); +/// Overload relying on pass options for initialization. +std::unique_ptr<OpPassBase<FuncOp>> createAffineDataCopyGenerationPass(); /// Creates a pass to perform tiling on loop nests. std::unique_ptr<OpPassBase<FuncOp>> createLoopTilingPass(uint64_t cacheSizeBytes); +/// Overload relying on pass options for initialization. +std::unique_ptr<OpPassBase<FuncOp>> createLoopTilingPass(); /// Creates a loop unrolling pass with the provided parameters. /// 'getUnrollFactor' is a function callback for clients to supply a function @@ -67,6 +71,8 @@ createLoopUnrollAndJamPass(int unrollJamFactor = -1); /// target-independent, n-D super-vector abstraction. std::unique_ptr<OpPassBase<FuncOp>> createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize); +/// Overload relying on pass options for initialization. +std::unique_ptr<OpPassBase<FuncOp>> createSuperVectorizePass(); } // end namespace mlir diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td new file mode 100644 index 000000000000..4ae53571d1f4 --- /dev/null +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -0,0 +1,70 @@ +//===-- Passes.td - Affine pass definition file ------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains definitions for passes within the Affine/ directory.
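Each explicit-argument constructor above now has a parameterless sibling whose configuration comes from the tablegen'd pass options instead. A usage sketch for the data-copy generation pass (the memory-space numbers are target-specific assumptions, not values taken from this patch):

```cpp
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Pass/PassManager.h"

static void addAffineDataCopyGeneration(mlir::OpPassManager &pm) {
  // Explicit form: stage copies from (slow) memory space 0 into (fast)
  // space 1, with DMA transfers of at least 1 KiB; capacity keeps its default.
  pm.addPass(mlir::createAffineDataCopyGenerationPass(
      /*slowMemorySpace=*/0, /*fastMemorySpace=*/1,
      /*tagMemorySpace=*/0, /*minDmaTransferSize=*/1024));
  // Option-driven form, for configuration from the command line:
  //   pm.addPass(mlir::createAffineDataCopyGenerationPass());
}
```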
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AFFINE_PASSES +#define MLIR_DIALECT_AFFINE_PASSES + +include "mlir/Pass/PassBase.td" + +def AffineDataCopyGeneration : Pass<"affine-data-copy-generate"> { + let summary = "Generate explicit copying for affine memory operations"; + let constructor = "mlir::createAffineDataCopyGenerationPass()"; +} + +def AffineLoopInvariantCodeMotion : Pass<"affine-loop-invariant-code-motion"> { + let summary = "Hoist loop invariant instructions outside of affine loops"; + let constructor = "mlir::createAffineLoopInvariantCodeMotionPass()"; +} + +def AffineLoopTiling : Pass<"affine-loop-tile"> { + let summary = "Tile affine loop nests"; + let constructor = "mlir::createLoopTilingPass()"; +} + +def AffineLoopUnroll : Pass<"affine-loop-unroll"> { + let summary = "Unroll affine loops"; + let constructor = "mlir::createLoopUnrollPass()"; +} + +def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam"> { + let summary = "Unroll and jam affine loops"; + let constructor = "mlir::createLoopUnrollAndJamPass()"; +} + +def AffineVectorize : Pass<"affine-super-vectorize"> { + let summary = "Vectorize to a target independent n-D vector abstraction"; + let constructor = "mlir::createSuperVectorizePass()"; + let options = [ + ListOption<"vectorSizes", "virtual-vector-size", "int64_t", + "Specify an n-D virtual vector size for vectorization", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">, + // Optionally, the fixed mapping from loop to fastest varying MemRef + // dimension for all the MemRefs within a loop pattern: + // the index represents the loop depth, the value represents the k^th + // fastest varying memory dimension. + // This is voluntarily restrictive and is meant to precisely target a + // particular loop/op pair, for testing purposes. + ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t", + "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory " + "dimensions to match. See defaultPatterns in Vectorize.cpp for " + "a description and examples. This is used for testing purposes", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + +def SimplifyAffineStructures : Pass<"simplify-affine-structures"> { + let summary = "Simplify affine expressions in maps/sets and normalize " + "memrefs"; + let constructor = "mlir::createSimplifyAffineStructuresPass()"; +} + +#endif // MLIR_DIALECT_AFFINE_PASSES diff --git a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt index 2a493d6c1b20..b2d6dc660dbb 100644 --- a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt @@ -1,2 +1,8 @@ add_mlir_dialect(FxpMathOps fxpmath) add_mlir_doc(FxpMathOps -gen-dialect-doc FxpMathDialect Dialects/) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRFxpMathPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc FxpMathPasses ./) diff --git a/mlir/include/mlir/Dialect/FxpMathOps/Passes.h b/mlir/include/mlir/Dialect/FxpMathOps/Passes.h index 1039bcc9cb34..bec2e74f095f 100644 --- a/mlir/include/mlir/Dialect/FxpMathOps/Passes.h +++ b/mlir/include/mlir/Dialect/FxpMathOps/Passes.h @@ -13,6 +13,8 @@ #ifndef MLIR_DIALECT_FXPMATHOPS_PASSES_H #define MLIR_DIALECT_FXPMATHOPS_PASSES_H +#include <memory> + namespace mlir { class FuncOp; template <typename T> class OpPassBase; @@ -23,11 +25,11 @@ namespace fxpmath { /// arithmetic.
This will leave unrecognized real math ops as-is and is /// typically followed by a pass that lowers any unrecognized ops to a pure /// floating point form. -OpPassBase<FuncOp> *createLowerUniformRealMathPass(); +std::unique_ptr<OpPassBase<FuncOp>> createLowerUniformRealMathPass(); /// Creates a pass that lowers uniform-quantized qcast/dcast ops to equivalent /// operations that perform quantize/dequantize. -OpPassBase<FuncOp> *createLowerUniformCastsPass(); +std::unique_ptr<OpPassBase<FuncOp>> createLowerUniformCastsPass(); } // namespace fxpmath } // namespace mlir diff --git a/mlir/include/mlir/Dialect/FxpMathOps/Passes.td b/mlir/include/mlir/Dialect/FxpMathOps/Passes.td new file mode 100644 index 000000000000..254d200029f5 --- /dev/null +++ b/mlir/include/mlir/Dialect/FxpMathOps/Passes.td @@ -0,0 +1,24 @@ +//===-- Passes.td - FxpMath pass definition file -----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_FXPMATH_PASSES +#define MLIR_DIALECT_FXPMATH_PASSES + +include "mlir/Pass/PassBase.td" + +def FxpMathLowerUniformCasts : Pass<"fxpmath-lower-uniform-casts"> { + let summary = "Lowers uniform-quantized casts"; + let constructor = "mlir::fxpmath::createLowerUniformCastsPass()"; +} + +def FxpMathLowerUniformRealMath : Pass<"fxpmath-lower-uniform-real-math"> { + let summary = "Lowers uniform-quantized real math ops to integer arithmetic"; + let constructor = "mlir::fxpmath::createLowerUniformRealMathPass()"; +} + +#endif // MLIR_DIALECT_FXPMATH_PASSES diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt index 8151c82f43d1..6c80b4c8e3b9 100644 --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -10,3 +10,9 @@ set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls) mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRGPUPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc GPUPasses ./) diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td new file mode 100644 index 000000000000..563624308297 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/Passes.td @@ -0,0 +1,19 @@ +//===-- Passes.td - GPU pass definition file ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_PASSES +#define MLIR_DIALECT_GPU_PASSES + +include "mlir/Pass/PassBase.td" + +def GpuKernelOutlining : Pass<"gpu-kernel-outlining"> { + let summary = "Outline gpu.launch bodies to kernel functions"; + let constructor = "mlir::createGpuKernelOutliningPass()"; +} + +#endif // MLIR_DIALECT_GPU_PASSES diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt index d7e581b1b949..cc4fd1bafc72 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(Transforms) + set(LLVM_TARGET_DEFINITIONS LLVMOps.td) mlir_tablegen(LLVMOps.h.inc -gen-op-decls) mlir_tablegen(LLVMOps.cpp.inc -gen-op-defs) diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/Transforms/CMakeLists.txt new file mode 100644 index 000000000000..a2fd81c23e11 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/CMakeLists.txt @@ -0,0 +1,5 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRLLVMPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc LLVMPasses ./) diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td new file mode 100644 index 000000000000..0dc193e794f5 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td @@ -0,0 +1,19 @@ +//===-- Passes.td - LLVM pass definition file --------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES +#define MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES + +include "mlir/Pass/PassBase.td" + +def LLVMLegalizeForExport : Pass<"llvm-legalize-for-export"> { + let summary = "Legalize LLVM dialect to be convertible to LLVM IR"; + let constructor = "mlir::LLVM::createLegalizeForExportPass()"; +} + +#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h b/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h index 34de176a998e..e40d63661b77 100644 --- a/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h +++ b/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h @@ -63,6 +63,7 @@ class LinalgDependenceGraph { using dependence_range = iterator_range<dependence_iterator>; enum DependenceType { RAR = 0, RAW, WAR, WAW, NumTypes }; + static StringRef getDependenceTypeStr(DependenceType depType); // Builds a linalg dependence graph for the ops of type LinalgOp under `f`.
static LinalgDependenceGraph buildDependenceGraph(Aliases &aliases, FuncOp f); diff --git a/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt index 9f57627c321f..076c2dfbccb5 100644 --- a/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt @@ -1,2 +1,8 @@ add_subdirectory(IR) add_subdirectory(Transforms) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRLinalgPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc LinalgPasses ./) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h index 7756a08d5cb2..77d9d9fc2631 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h @@ -29,6 +29,9 @@ namespace mlir { namespace linalg { class ConvOp; +class PoolingMaxOp; +class PoolingMinOp; +class PoolingSumOp; /// Returns the name mangled library call name to disambiguate between different /// overloads at the C level. The name mangling scheme is basic and uses MLIR @@ -60,12 +63,13 @@ std::string generateLibraryCallName(Operation *op); SmallVector<AffineExpr, 4> makeAffineDimExprs(unsigned num, unsigned &startIdx, MLIRContext *context); -/// Builds the indexing expressions for a ConvOp `op`. Returns the vector of -/// AffineMaps representing: -/// `stride[i] * xs[i] + dilation[i] * zs[i] - pad_low[i]` -SmallVector<AffineExpr, 4> weightedConvInputIndex(ConvOp op, - ArrayRef<AffineExpr> xs, - ArrayRef<AffineExpr> zs); +/// Builds the indexing expressions for a ConvOp/PoolingOp `op`. Returns the +/// vector of AffineMaps representing: +/// `stride[i] * outputDims[i] + dilation[i] * windowDims[i] - pad_low[i]` +template <typename PoolingOp> +extern SmallVector<AffineExpr, 4> +weightedPoolingInputIndex(PoolingOp op, ArrayRef<AffineExpr> outputDims, + ArrayRef<AffineExpr> windowDims); /// Returns `maybeMap.get()` if `maybeMap` is set, otherwise returns the /// symbol-less identity map of `rank`. diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index ab53fc30aca8..31b89bc1b2bf 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -251,7 +251,69 @@ def MatmulOp : LinalgStructured_Op<"matmul", [NInputs<2>, NOutputs<1>]> { let hasFolder = 1; } -def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { +/// A base class for pooling operations such as conv. The arguments must contain +/// optional arguments `strides`, `dilations` and `padding` with the following types: +/// OptionalAttr<I64ArrayAttr>:$strides +/// OptionalAttr<I64ArrayAttr>:$dilations +/// OptionalAttr<I64ElementsAttr>:$padding +/// `strides` denotes the step of each window along the dimension. +class PoolingBase_Op<string mnemonic, list<OpTrait> props> + : LinalgStructured_Op<mnemonic, props> { + let description = [{ + Performs an N-D pooling operation similarly to the description in the TF + documentation: + https://www.tensorflow.org/api_docs/python/tf/nn/pool + + Different from the description, this operation doesn't operate on the batch + and channel dimensions. It only takes tensors of rank `N`. + + ``` + output[x[0], ..., x[N-1]] = + REDUCE_{z[0], ..., z[N-1]} + input[ + x[0] * strides[0] - pad_before[0] + dilation_rate[0]*z[0], + ... + x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1] + ], + ``` + + The optional arguments are: + - strides: an i64 array specifying the stride (i.e. step) for window + loops.
+ - dilations: an i64 array specifying the filter upsampling/input + downsampling rate + - padding: an i64 array of pairs (low, high) specifying the number of + elements to pad along a dimension. + + If the strides or dilations attributes are missing, the default value is + one for each of the input dimensions. Similarly, padding values are zero + for both low and high in each of the dimensions, if not specified. + }]; + + code commonUtils = libraryCallName # [{ + int64_t getStride(unsigned i) { + assert(i < getNumWindowLoops()); + if (!strides().hasValue()) return 1; + return strides()->getValue()[i] + .cast<IntegerAttr>().getValue().getSExtValue(); + } + + int64_t getDilation(unsigned i) { + assert(i < getNumWindowLoops()); + if (!dilations().hasValue()) return 1; + return dilations()->getValue()[i] + .cast<IntegerAttr>().getValue().getSExtValue(); + } + + int64_t getLowPad(unsigned i) { + assert(i < getNumWindowLoops()); + if (!padding().hasValue()) return 0; + return padding().getValue().getValue({i, 0}); + } + }]; +} + +def ConvOp : PoolingBase_Op<"conv", [NInputs<2>, NOutputs<1>]> { let description = [{ Generic n-D convolution as described in the TF documentation: @@ -282,7 +344,7 @@ def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { OptionalAttr<I64ArrayAttr>:$dilations, OptionalAttr<I64ElementsAttr>:$padding); - let extraClassDeclaration = libraryCallName # [{ + let extraClassDeclaration = commonUtils # [{ // TODO(ntv) extend to support more than 1 dimensions and potentially // grouping too. unsigned getNumBatchDimensions() { return 1; } @@ -309,26 +371,6 @@ def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { return iters; } - int64_t getStride(unsigned i) { - assert(i < getNumWindowLoops()); - if (!strides().hasValue()) return 1; - return strides()->getValue()[i] - .cast<IntegerAttr>().getValue().getSExtValue(); - } - - int64_t getDilation(unsigned i) { - assert(i < getNumWindowLoops()); - if (!dilations().hasValue()) return 1; - return dilations()->getValue()[i] - .cast<IntegerAttr>().getValue().getSExtValue(); - } - - int64_t getLowPad(unsigned i) { - assert(i < getNumWindowLoops()); - if (!padding().hasValue()) return 0; - return padding().getValue().getValue({i, 0}); - } - // F(z0, ..., zN-1, q, k) * // I(b, x0 + z0 - pad_low_0, ..., xN-1 + zN-1 - pad_low_N-1, q) // -> O(b, x0, ..., xN-1, k) @@ -358,7 +400,7 @@ def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { // Window reduction dims: sum_{z[0], ..., z[N-1], q} auto zs = makeAffineDimExprs(nWin, idx, context); // Construct the weightedSum expression. - auto ws = weightedConvInputIndex(*this, xs, zs); + auto ws = weightedPoolingInputIndex(*this, xs, zs); return SmallVector<AffineMap, 8>{ // filter[z[0], ..., z[N-1], q, k] AffineMap::get(idx, 0, concat(concat(zs, qs), ks)), @@ -378,6 +420,86 @@ def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { let hasFolder = 1; } +class SingleInputPoolingBase_Op<string mnemonic> + : PoolingBase_Op<mnemonic, [NInputs<2>, NOutputs<1>]> { + let description = [{ + A base class for single input pooling operations. + + TODO: Figure out a better way to handle window dimensions, i.e., eliminate + the fake memref. + The window dimensions are specified by argument `windowDims`. The i-th + dimension in the shape of `windowDims` denotes the size of the window along + dimension i. For example, if the window size is 2x3, then a memref<2x3> + should be passed to the operation as `windowDims`.
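To make the access rule quoted in the pooling description concrete, here is the same windowed-index computation restated as scalar C++. This is purely illustrative; the op builds the identical expression symbolically via the affine expressions returned by `weightedPoolingInputIndex`:

```cpp
#include <cstdint>

// Input index read for output position `x` and window position `z`:
//   x * stride - pad_before + dilation * z
int64_t pooledInputIndex(int64_t x, int64_t z, int64_t stride,
                         int64_t dilation, int64_t padBefore) {
  return x * stride - padBefore + dilation * z;
}
```

With stride 1, dilation 1, and no padding this degenerates to `x + z`, i.e. a plain sliding window.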
+ }]; + + let arguments = (ins AnyStridedMemRef:$input, + AnyStridedMemRef:$windowDims, + AnyStridedMemRef:$output, + OptionalAttr<I64ArrayAttr>:$strides, + OptionalAttr<I64ArrayAttr>:$dilations, + OptionalAttr<I64ElementsAttr>:$padding); + + let extraClassDeclaration = commonUtils # [{ + llvm::Optional<SmallVector<StringRef, 8>> referenceIterators() { + // Outer parallel loops are always the number of output dimensions. + unsigned nPar = getOutputShapedType(0).getRank(); + // The number of window loops equals the number of output dimensions. + unsigned nWin = nPar; + SmallVector<StringRef, 8> iters(nPar, getParallelIteratorTypeName()); + iters.reserve(nPar + nWin); + iters.append(nWin, getWindowIteratorTypeName()); + return iters; + } + + llvm::Optional<SmallVector<AffineMap, 8>> referenceIndexingMaps() { + MLIRContext *context = getContext(); + auto nPar = getNumParallelLoops(); + auto nWin = getNumWindowLoops(); + assert(nWin > 0 && "expected at least one window dimension"); + unsigned idx = 0; + auto outputDims = makeAffineDimExprs(nPar, idx, context); + auto windowDims = makeAffineDimExprs(nWin, idx, context); + // Construct the weightedSum expression. + auto inputDims = + weightedPoolingInputIndex(*this, outputDims, windowDims); + return SmallVector<AffineMap, 8>{ + // input + AffineMap::get(idx, 0, inputDims), + // windowDims + AffineMap::get(idx, 0, windowDims), + // output + AffineMap::get(idx, 0, outputDims) + }; + } + }]; + + let verifier = [{ return ::verify(*this); }]; + + let hasFolder = 1; +} + +def PoolingMaxOp: SingleInputPoolingBase_Op<"pooling_max"> { + let description = [{ + Takes max op as pooling operation, i.e., it samples the maximum value in the + window. + }]; +} + +def PoolingMinOp: SingleInputPoolingBase_Op<"pooling_min"> { + let description = [{ + Takes min op as pooling operation, i.e., it samples the minimum value in the + window. + }]; +} + +def PoolingSumOp: SingleInputPoolingBase_Op<"pooling_sum"> { + let description = [{ + Takes add op as pooling operation, i.e., it accumulates the values in the + window. + }]; +} + //===----------------------------------------------------------------------===// // Generic Linalg ops. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td index 8fcc1ceea502..46fb9881aba5 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td @@ -100,6 +100,10 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> { //===------------------------------------------------------------------===// // Input and Output arguments handling. //===------------------------------------------------------------------===// + InterfaceMethod< + "Return one single buffer at position `$i`.", + "Value", "getBuffer", (ins "unsigned":$i) + >, InterfaceMethod< "Return the number of inputs and outputs, irrespective of their buffer " "or tensor type.", diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h index f546d3670b6a..b13b6d268226 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h @@ -184,6 +184,10 @@ class StructuredOpTraits //==========================================================================// // Input and Output arguments handling.
//==========================================================================// + Value getBuffer(unsigned i) { + assert(i < getNumInputsAndOutputBuffers() && "overflowing buffers index"); + return this->getOperation()->getOperand(i); + } /// Return the number of inputs and outputs, irrespective of their buffer or /// tensor type. unsigned getNumInputsAndOutputs() { return nInputs() + nOutputs(); } diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h index 61f88c10470a..9f52e360c7fb 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -20,8 +20,10 @@ namespace mlir { class FuncOp; class ModuleOp; template <typename T> class OpPassBase; +class Pass; std::unique_ptr<OpPassBase<FuncOp>> createLinalgFusionPass(); +std::unique_ptr<Pass> createLinalgFusionOfTensorOpsPass(); std::unique_ptr<OpPassBase<FuncOp>> createLinalgTilingPass(ArrayRef<int64_t> tileSizes = {}); @@ -31,6 +33,7 @@ createLinalgTilingToParallelLoopsPass(ArrayRef<int64_t> tileSizes = {}); std::unique_ptr<OpPassBase<FuncOp>> createLinalgPromotionPass(bool dynamicBuffers); +std::unique_ptr<OpPassBase<FuncOp>> createLinalgPromotionPass(); /// Create a pass to convert Linalg operations to loop.for loops and /// std.load/std.store accesses. diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td new file mode 100644 index 000000000000..210ad1092c66 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -0,0 +1,70 @@ +//===-- Passes.td - Linalg pass definition file ------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_PASSES +#define MLIR_DIALECT_LINALG_PASSES + +include "mlir/Pass/PassBase.td" + +def LinalgFusion : Pass<"linalg-fusion"> { + let summary = "Fuse operations in the linalg dialect"; + let constructor = "mlir::createLinalgFusionPass()"; +} + +def LinalgFusionOfTensorOps : Pass<"linalg-fusion-for-tensor-ops"> { + let summary = "Fuse operations on RankedTensorType in linalg dialect"; + let constructor = "mlir::createLinalgFusionOfTensorOpsPass()"; +} + +def LinalgLowerToAffineLoops : Pass<"convert-linalg-to-affine-loops"> { + let summary = "Lower the operations from the linalg dialect into affine " + "loops"; + let constructor = "mlir::createConvertLinalgToAffineLoopsPass()"; +} + +def LinalgLowerToLoops : Pass<"convert-linalg-to-loops"> { + let summary = "Lower the operations from the linalg dialect into loops"; + let constructor = "mlir::createConvertLinalgToLoopsPass()"; +} + +def LinalgLowerToParallelLoops : Pass<"convert-linalg-to-parallel-loops"> { + let summary = "Lower the operations from the linalg dialect into parallel " + "loops"; + let constructor = "mlir::createConvertLinalgToParallelLoopsPass()"; +} + +def LinalgPromotion : Pass<"linalg-promote-subviews"> { + let summary = "Promote subview ops to local buffers"; + let constructor = "mlir::createLinalgPromotionPass()"; + let options = [ + Option<"dynamicBuffers", "test-promote-dynamic", "bool", + /*default=*/"false", "Test generation of dynamic promoted buffers"> + ]; +} + +def LinalgTiling : Pass<"linalg-tile"> { + let summary = "Tile operations in the linalg dialect"; + let constructor = "mlir::createLinalgTilingPass()"; + let options = [ + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", + "Tile sizes",
dynamic promoted buffers", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + +def LinalgTilingToParallelLoops : Pass<"linalg-tile-to-parallel-loops"> { + let summary = "Tile operations in the linalg dialect to parallel loops"; + let constructor = "mlir::createLinalgTilingToParallelLoopsPass()"; + let options = [ + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", + "Test generation of dynamic promoted buffers", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + +#endif // MLIR_DIALECT_LINALG_PASSES diff --git a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt index 4a838cc1d52d..2627cbb542da 100644 --- a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt @@ -1,2 +1,8 @@ add_mlir_dialect(LoopOps loop) add_mlir_doc(LoopOps -gen-dialect-doc LoopDialect Dialects/) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRLoopPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc LoopPasses ./) diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td index 08f61c493e02..e202a4013a19 100644 --- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td @@ -246,7 +246,9 @@ def IfOp : Loop_Op<"if", } def ParallelOp : Loop_Op<"parallel", - [AttrSizedOperandSegments, SingleBlockImplicitTerminator<"YieldOp">]> { + [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"YieldOp">]> { let summary = "parallel for operation"; let description = [{ The "loop.parallel" operation represents a loop nest taking 4 groups of SSA diff --git a/mlir/include/mlir/Dialect/LoopOps/Passes.td b/mlir/include/mlir/Dialect/LoopOps/Passes.td new file mode 100644 index 000000000000..444dcfe22201 --- /dev/null +++ b/mlir/include/mlir/Dialect/LoopOps/Passes.td @@ -0,0 +1,34 @@ +//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LOOP_PASSES +#define MLIR_DIALECT_LOOP_PASSES + +include "mlir/Pass/PassBase.td" + +def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> { + let summary = "Fuse adjacent parallel loops"; + let constructor = "mlir::createParallelLoopFusionPass()"; +} + +def LoopParallelLoopSpecialization : Pass<"parallel-loop-specialization"> { + let summary = "Specialize parallel loops for vectorization"; + let constructor = "mlir::createParallelLoopSpecializationPass()"; +} + +def LoopParallelLoopTiling : Pass<"parallel-loop-tiling"> { + let summary = "Tile parallel loops"; + let constructor = "mlir::createParallelLoopTilingPass()"; + let options = [ + ListOption<"tileSizes", "parallel-loop-tile-sizes", "int64_t", + "Factors to tile parallel loops by", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + +#endif // MLIR_DIALECT_LOOP_PASSES diff --git a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt index b18726736e94..1a48e4928b33 100644 --- a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt @@ -1,2 +1,8 @@ add_mlir_dialect(QuantOps quant) add_mlir_doc(QuantOps -gen-dialect-doc QuantDialect Dialects/) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRQuantPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc QuantPasses ./) diff --git a/mlir/include/mlir/Dialect/Quant/Passes.td b/mlir/include/mlir/Dialect/Quant/Passes.td new file mode 100644 index 000000000000..f55a43006977 --- /dev/null +++ b/mlir/include/mlir/Dialect/Quant/Passes.td @@ -0,0 +1,26 @@ +//===-- Passes.td - Quant pass definition file -------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANT_PASSES +#define MLIR_DIALECT_QUANT_PASSES + +include "mlir/Pass/PassBase.td" + +def QuantConvertConst : Pass<"quant-convert-const"> { + let summary = "Converts constants followed by qbarrier to actual quantized " + "values"; + let constructor = "mlir::quant::createConvertConstPass()"; +} + +def QuantConvertSimulatedQuant : Pass<"quant-convert-simulated-quantization"> { + let summary = "Converts training-time simulated quantization ops to " + "corresponding quantize/dequantize casts"; + let constructor = "mlir::quant::createConvertSimulatedQuantPass()"; +} + +#endif // MLIR_DIALECT_QUANT_PASSES diff --git a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt index 771d4c1a43bb..15f4a4dfe847 100644 --- a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_dialect(SPIRVOps spv) -add_mlir_doc(SPIRVOps -gen-dialect-doc SPIRVDialect Dialects/) +add_mlir_doc(SPIRVOps -gen-op-doc SPIRVOps Dialects/) set(LLVM_TARGET_DEFINITIONS SPIRVBase.td) mlir_tablegen(SPIRVEnums.h.inc -gen-enum-decls) @@ -30,3 +30,9 @@ set(LLVM_TARGET_DEFINITIONS TargetAndABI.td) mlir_tablegen(TargetAndABI.h.inc -gen-struct-attr-decls) mlir_tablegen(TargetAndABI.cpp.inc -gen-struct-attr-defs) add_public_tablegen_target(MLIRSPIRVTargetAndABIIncGen) + +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(MLIRSPIRVPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc SPIRVPasses ./) diff --git a/mlir/include/mlir/Dialect/SPIRV/Passes.td b/mlir/include/mlir/Dialect/SPIRV/Passes.td new file mode 100644 index 000000000000..a03849955797 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/Passes.td @@ -0,0 +1,30 @@ +//===-- Passes.td - SPIRV pass definition file -------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_PASSES +#define MLIR_DIALECT_SPIRV_PASSES + +include "mlir/Pass/PassBase.td" + +def SPIRVCompositeTypeLayout : Pass<"decorate-spirv-composite-type-layout"> { + let summary = "Decorate SPIR-V composite type with layout info"; + let constructor = "mlir::spirv::createDecorateSPIRVCompositeTypeLayoutPass()"; +} + +def SPIRVLowerABIAttributes : Pass<"spirv-lower-abi-attrs"> { + let summary = "Lower ABI attributes on SPIR-V entry-point functions"; + let constructor = "mlir::spirv::createLowerABIAttributesPass()"; +} + +def SPIRVUpdateVCE : Pass<"spirv-update-vce"> { + let summary = "Deduce and attach minimal (version, capabilities, extensions) " + "requirements to spv.module ops"; + let constructor = "mlir::spirv::createUpdateVersionCapabilityExtensionPass()"; +} + +#endif // MLIR_DIALECT_SPIRV_PASSES diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h index d54791a65410..bb37bb28a18c 100644 --- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h @@ -72,6 +72,15 @@ constexpr StringRef getFunAttrName() { return "fun"; } /// function that implements the structured op.
constexpr StringRef getLibraryCallAttrName() { return "library_call"; } +/// Attribute name for the I64ArrayAttr which encodes the value of strides. +constexpr StringRef getStridesAttrName() { return "strides"; } + +/// Attribute name for the I64ArrayAttr which encodes the value of dilations. +constexpr StringRef getDilationsAttrName() { return "dilations"; } + +/// Attribute name for the I64ElementsAttr which encodes the value of padding. +constexpr StringRef getPaddingAttrName() { return "padding"; } + /// Use to encode that a particular iterator type has parallel semantics. constexpr StringRef getParallelIteratorTypeName() { return "parallel"; } diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td index d0629e4ff724..a0ad88347bd9 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td @@ -1315,10 +1315,12 @@ def Vector_TransposeOp : VectorType getResultType() { return result().getType().cast<VectorType>(); } + void getTransp(SmallVectorImpl<int64_t> &results); }]; let assemblyFormat = [{ $vector `,` $transp attr-dict `:` type($vector) `to` type($result) }]; + let hasFolder = 1; } def Vector_TupleGetOp : diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index 14deb85fb2f0..ff65da62a115 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -49,9 +49,16 @@ class AffineMap { static AffineMap get(unsigned dimCount, unsigned symbolCount, MLIRContext *context); + /// Returns an affine map with `dimCount` dimensions and `symbolCount` symbols + /// mapping to the given results. The array of results cannot be empty. static AffineMap get(unsigned dimCount, unsigned symbolCount, ArrayRef<AffineExpr> results); + /// Returns an affine map with `dimCount` dimensions and `symbolCount` symbols + /// mapping to the given results, where the number of results can be zero. + static AffineMap get(unsigned dimCount, unsigned symbolCount, + ArrayRef<AffineExpr> results, MLIRContext *context); + /// Returns a single constant result affine map. static AffineMap getConstantMap(int64_t val, MLIRContext *context); @@ -208,9 +215,13 @@ struct MutableAffineMap { MLIRContext *context; }; -/// Simplify an affine map by simplifying its underlying AffineExpr results. +/// Simplifies an affine map by simplifying its underlying AffineExpr results. AffineMap simplifyAffineMap(AffineMap map); +/// Returns a map with the same dimension and symbol count as `map`, but whose +/// results are the unique affine expressions of `map`. +AffineMap removeDuplicateExprs(AffineMap map); + /// Returns a map of codomain to domain dimensions such that the first codomain /// dimension for a particular domain dimension is selected. /// Returns an empty map if the input map is empty or if `map` is not invertible diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h index 5b42132d463a..1c6b16f22989 100644 --- a/mlir/include/mlir/IR/Builders.h +++ b/mlir/include/mlir/IR/Builders.h @@ -188,13 +188,23 @@ class OpBuilder : public Builder { setInsertionPoint(op); } - explicit OpBuilder(Block *block) : OpBuilder(block, block->end()) {} - OpBuilder(Block *block, Block::iterator insertPoint) : OpBuilder(block->getParent()) { setInsertionPoint(block, insertPoint); } + /// Create a builder and set the insertion point to before the first operation + /// in the block but still inside the block.
+ static OpBuilder atBlockBegin(Block *block) { + return OpBuilder(block, block->begin()); + } + + /// Create a builder and set the insertion point to after the last operation + /// in the block but still inside the block. + static OpBuilder atBlockEnd(Block *block) { + return OpBuilder(block, block->end()); + } + /// This class represents a saved insertion point. class InsertPoint { public: diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 4671b4081401..08d267a9cf18 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -22,8 +22,11 @@ #include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h" #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" #include "mlir/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.h" +#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" #include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/FxpMathOps/Passes.h" #include "mlir/Dialect/GPU/Passes.h" @@ -35,6 +38,8 @@ #include "mlir/Quantizer/Transforms/Passes.h" #include "mlir/Transforms/LocationSnapshot.h" #include "mlir/Transforms/Passes.h" +#include "mlir/Transforms/ViewOpGraph.h" +#include "mlir/Transforms/ViewRegionGraph.h" #include <cstdlib> @@ -48,94 +53,47 @@ namespace mlir { // individual passes. // The global registry is interesting to interact with the command-line tools. inline void registerAllPasses() { - // At the moment we still rely on global initializers for registering passes, - // but we may not do it in the future. - // We must reference the passes in such a way that compilers will not - // delete it all as dead code, even with whole program optimization, - // yet is effectively a NO-OP. As the compiler isn't smart enough - // to know that getenv() never returns -1, this will do the job.
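Returning briefly to the `OpBuilder` factories added in `Builders.h` above: the removed single-argument `OpBuilder(Block *)` constructor implicitly meant "insert at the end", whereas the two named factories spell the position out. A minimal usage sketch:

```cpp
#include "mlir/IR/Builders.h"

static void positionBuilders(mlir::Block *block) {
  // Insert before the first operation of the block.
  mlir::OpBuilder begin = mlir::OpBuilder::atBlockBegin(block);
  // Insert after the last operation of the block (the old implicit default).
  mlir::OpBuilder end = mlir::OpBuilder::atBlockEnd(block);
  (void)begin;
  (void)end;
}
```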
- if (std::getenv("bar") != (char *)-1) - return; - // Init general passes - createCanonicalizerPass(); - createCSEPass(); - createSuperVectorizePass({}); - createLoopUnrollPass(); - createLoopUnrollAndJamPass(); - createSimplifyAffineStructuresPass(); - createLoopFusionPass(); - createLoopInvariantCodeMotionPass(); - createAffineLoopInvariantCodeMotionPass(); - createPipelineDataTransferPass(); - createLowerAffinePass(); - createLoopTilingPass(0); - createLoopCoalescingPass(); - createAffineDataCopyGenerationPass(0, 0); - createMemRefDataFlowOptPass(); - createStripDebugInfoPass(); - createPrintOpStatsPass(); - createInlinerPass(); - createSymbolDCEPass(); - createLocationSnapshotPass({}); - - // AVX512 - createConvertAVX512ToLLVMPass(); - - // GPUtoROCDLPass - createLowerGpuOpsToROCDLOpsPass(); - - // FxpOpsDialect passes - fxpmath::createLowerUniformRealMathPass(); - fxpmath::createLowerUniformCastsPass(); +#define GEN_PASS_REGISTRATION +#include "mlir/Transforms/Passes.h.inc" - // GPU - createGpuKernelOutliningPass(); - createSimpleLoopsToGPUPass(0, 0); - createLoopToGPUPass({}, {}); + // Conversion passes +#define GEN_PASS_REGISTRATION +#include "mlir/Conversion/Passes.h.inc" + + // Affine +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/Affine/Passes.h.inc" - // CUDA - createConvertGpuLaunchFuncToCudaCallsPass(); - createLowerGpuOpsToNVVMOpsPass(); + // FxpMath +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/FxpMathOps/Passes.h.inc" + + // GPU +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/GPU/Passes.h.inc" // Linalg - createLinalgFusionPass(); - createLinalgTilingPass(); - createLinalgTilingToParallelLoopsPass(); - createLinalgPromotionPass(0); - createConvertLinalgToLoopsPass(); - createConvertLinalgToParallelLoopsPass(); - createConvertLinalgToAffineLoopsPass(); - createConvertLinalgToLLVMPass(); +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/Linalg/Passes.h.inc" // LLVM - LLVM::createLegalizeForExportPass(); +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc" - // LoopOps - createParallelLoopCollapsingPass(); - createParallelLoopFusionPass(); - createParallelLoopSpecializationPass(); - createParallelLoopTilingPass(); + // Loop +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/LoopOps/Passes.h.inc" - // QuantOps - quant::createConvertSimulatedQuantPass(); - quant::createConvertConstPass(); - quantizer::createAddDefaultStatsPass(); - quantizer::createRemoveInstrumentationPass(); - quantizer::registerInferQuantizedTypesPass(); + // Quant +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/Quant/Passes.h.inc" +#define GEN_PASS_REGISTRATION +#include "mlir/Quantizer/Transforms/Passes.h.inc" // SPIR-V - spirv::createDecorateSPIRVCompositeTypeLayoutPass(); - spirv::createLowerABIAttributesPass(); - spirv::createUpdateVersionCapabilityExtensionPass(); - createConvertGPUToSPIRVPass(); - createConvertStandardToSPIRVPass(); - createLegalizeStdOpsForSPIRVLoweringPass(); - createLinalgToSPIRVPass(); - - // Vulkan - createConvertGpuLaunchFuncToVulkanLaunchFuncPass(); - createConvertVulkanLaunchFuncToVulkanCallsPass(); +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/SPIRV/Passes.h.inc" } } // namespace mlir diff --git a/mlir/include/mlir/Pass/PassBase.td b/mlir/include/mlir/Pass/PassBase.td new file mode 100644 index 000000000000..29478047f757 --- /dev/null +++ b/mlir/include/mlir/Pass/PassBase.td @@ -0,0 +1,85 @@ +//===-- PassBase.td - Base pass definition file ------------*- tablegen -*-===// +// +// Part of the
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains definitions for defining pass registration and other +// mechanisms. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_PASS_PASSBASE +#define MLIR_PASS_PASSBASE + +//===----------------------------------------------------------------------===// +// Options +//===----------------------------------------------------------------------===// + +class Option<string varName, string arg, string valueType, string default, + string desc, string additionalFlags = ""> { + // The name for the C++ option variable. + string cppName = varName; + + // The command line argument to use for this option. + string argument = arg; + + // The C++ type of the option. + string type = valueType; + + // The default value of the option. "" corresponds to no default. + string defaultValue = default; + + // A description for this option. + string description = desc; + + // A set of additional flags to pass along to the option constructor. + string additionalOptFlags = additionalFlags; +} + +class ListOption<string varName, string arg, string valueType, + string desc, string additionalFlags = ""> + : Option<varName, arg, valueType, /*default=*/"", desc, additionalFlags> {} + +//===----------------------------------------------------------------------===// +// Statistics +//===----------------------------------------------------------------------===// + +class Statistic<string varName, string statName, string desc> { + // The C++ variable name for the statistic. + string cppName = varName; + + // The displayed name of the statistic, similar to the argument of an option. + string name = statName; + + // The description of the statistic. + string description = desc; +} + +//===----------------------------------------------------------------------===// +// Pass +//===----------------------------------------------------------------------===// + +class Pass<string passArg> { + // The command line argument of the pass. + string argument = passArg; + + // A short 1-line summary of the pass. + string summary = ""; + + // A human readable description of the pass. + string description = ""; + + // A C++ constructor call to create an instance of this pass. + code constructor = [{}]; + + // A set of options provided by this pass. + list
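The `Option`/`ListOption` tablegen classes above correspond to the option members a hand-written pass of this era declares directly on `mlir::Pass`. A sketch under that assumption (the pass, its option names, and the pre-`PassWrapper` `FunctionPass` CRTP base are all illustrative, not taken from this patch):

```cpp
#include "mlir/Pass/Pass.h"

namespace {
// Hypothetical pass showing roughly what a tablegen'd Option/ListOption pair
// expands to: plain option members initialized with llvm::cl flags.
struct ExampleTilePass : public mlir::FunctionPass<ExampleTilePass> {
  // ~ ListOption<"tileSizes", "example-tile-sizes", "int64_t", ...>
  ListOption<int64_t> tileSizes{
      *this, "example-tile-sizes", llvm::cl::desc("Tile sizes"),
      llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
  // ~ Option<"useHeuristic", "use-heuristic", "bool", /*default=*/"false", ...>
  Option<bool> useHeuristic{*this, "use-heuristic",
                            llvm::cl::desc("Pick tile sizes heuristically"),
                            llvm::cl::init(false)};

  void runOnFunction() override {
    // Transformation body omitted; options read like ordinary values,
    // e.g. iterating tileSizes or testing bool(useHeuristic).
  }
};
} // namespace
```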